; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
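;
; For context, a rough sketch (hypothetical source, not part of this test): a
; scalar loop of this shape is the kind of code the LoopVectorizer turns into
; the wide shufflevector-plus-single-store IR exercised below, here for a
; store stride of 7:
;
;   void store7(char *a, char *b, char *c, char *d, char *e, char *f, char *g,
;               char *out, int n) {
;     for (int i = 0; i < n; i++) {
;       out[7*i + 0] = a[i]; out[7*i + 1] = b[i]; out[7*i + 2] = c[i];
;       out[7*i + 3] = d[i]; out[7*i + 4] = e[i]; out[7*i + 5] = f[i];
;       out[7*i + 6] = g[i];
;     }
;   }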
17 18define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { 19; SSE-LABEL: store_i8_stride7_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 23; SSE-NEXT: movdqa (%rdi), %xmm0 24; SSE-NEXT: movdqa (%rdx), %xmm1 25; SSE-NEXT: movdqa (%r8), %xmm2 26; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 27; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 28; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 29; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 30; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 31; SSE-NEXT: pxor %xmm1, %xmm1 32; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 33; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,7] 34; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] 35; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,0,3,4,5,6,7] 36; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] 37; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 38; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] 39; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] 40; SSE-NEXT: packuswb %xmm3, %xmm0 41; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 42; SSE-NEXT: pand %xmm3, %xmm0 43; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 44; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,1] 45; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 46; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 47; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 48; SSE-NEXT: packuswb %xmm1, %xmm2 49; SSE-NEXT: pandn %xmm2, %xmm3 50; SSE-NEXT: por %xmm0, %xmm3 51; SSE-NEXT: pextrw $6, %xmm2, %ecx 52; SSE-NEXT: movw %cx, 12(%rax) 53; SSE-NEXT: movq %xmm3, (%rax) 54; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 55; SSE-NEXT: movd %xmm0, 8(%rax) 56; SSE-NEXT: retq 57; 58; AVX-LABEL: store_i8_stride7_vf2: 59; AVX: # %bb.0: 60; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 61; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 62; AVX-NEXT: vmovdqa (%rdi), %xmm0 63; AVX-NEXT: vmovdqa (%rdx), %xmm1 64; AVX-NEXT: vmovdqa (%r8), %xmm2 65; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 66; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 67; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 68; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 69; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 70; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 71; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 72; AVX-NEXT: vpextrw $6, %xmm0, 12(%rax) 73; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) 74; AVX-NEXT: vmovq %xmm0, (%rax) 75; AVX-NEXT: retq 76; 77; AVX2-LABEL: store_i8_stride7_vf2: 78; AVX2: # %bb.0: 79; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 80; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 
81; AVX2-NEXT: vmovdqa (%rdi), %xmm0 82; AVX2-NEXT: vmovdqa (%rdx), %xmm1 83; AVX2-NEXT: vmovdqa (%r8), %xmm2 84; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 85; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 86; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 87; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 88; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 89; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 90; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 91; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rax) 92; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rax) 93; AVX2-NEXT: vmovq %xmm0, (%rax) 94; AVX2-NEXT: retq 95; 96; AVX2-FP-LABEL: store_i8_stride7_vf2: 97; AVX2-FP: # %bb.0: 98; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 99; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 100; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 101; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 102; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 103; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 104; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 105; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 106; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 107; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 108; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 109; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 110; AVX2-FP-NEXT: vpextrw $6, %xmm0, 12(%rax) 111; AVX2-FP-NEXT: vpextrd $2, %xmm0, 8(%rax) 112; AVX2-FP-NEXT: vmovq %xmm0, (%rax) 113; AVX2-FP-NEXT: retq 114; 115; AVX2-FCP-LABEL: store_i8_stride7_vf2: 116; AVX2-FCP: # %bb.0: 117; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 118; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 119; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 120; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 121; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 122; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 123; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 124; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 125; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 126; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 127; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 128; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 129; AVX2-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) 130; AVX2-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) 131; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) 132; AVX2-FCP-NEXT: retq 133; 134; AVX512-LABEL: store_i8_stride7_vf2: 135; AVX512: # %bb.0: 136; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 137; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 138; AVX512-NEXT: vmovdqa (%rdi), %xmm0 139; AVX512-NEXT: vmovdqa (%rdx), %xmm1 
140; AVX512-NEXT: vmovdqa (%r8), %xmm2 141; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 142; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 143; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 144; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 145; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 146; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 147; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 148; AVX512-NEXT: vpextrw $6, %xmm0, 12(%rax) 149; AVX512-NEXT: vpextrd $2, %xmm0, 8(%rax) 150; AVX512-NEXT: vmovq %xmm0, (%rax) 151; AVX512-NEXT: retq 152; 153; AVX512-FCP-LABEL: store_i8_stride7_vf2: 154; AVX512-FCP: # %bb.0: 155; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 156; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 157; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 158; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 159; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 160; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 161; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 162; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 163; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 164; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 165; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 166; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 167; AVX512-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) 168; AVX512-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) 169; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) 170; AVX512-FCP-NEXT: retq 171; 172; AVX512DQ-LABEL: store_i8_stride7_vf2: 173; AVX512DQ: # %bb.0: 174; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 175; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 176; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 177; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 178; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 179; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 180; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 181; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 182; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 183; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 184; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 185; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 186; AVX512DQ-NEXT: vpextrw $6, %xmm0, 12(%rax) 187; AVX512DQ-NEXT: vpextrd $2, %xmm0, 8(%rax) 188; AVX512DQ-NEXT: vmovq %xmm0, (%rax) 189; AVX512DQ-NEXT: retq 190; 191; AVX512DQ-FCP-LABEL: store_i8_stride7_vf2: 192; AVX512DQ-FCP: # %bb.0: 193; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 194; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 195; AVX512DQ-FCP-NEXT: vmovdqa 
(%rdi), %xmm0 196; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 197; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 198; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 199; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 200; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 201; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 202; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 203; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 204; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 205; AVX512DQ-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) 206; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) 207; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) 208; AVX512DQ-FCP-NEXT: retq 209; 210; AVX512BW-LABEL: store_i8_stride7_vf2: 211; AVX512BW: # %bb.0: 212; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 213; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 214; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 215; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 216; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 217; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 218; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 219; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 220; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 221; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 222; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 223; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 224; AVX512BW-NEXT: vpextrw $6, %xmm0, 12(%rax) 225; AVX512BW-NEXT: vpextrd $2, %xmm0, 8(%rax) 226; AVX512BW-NEXT: vmovq %xmm0, (%rax) 227; AVX512BW-NEXT: retq 228; 229; AVX512BW-FCP-LABEL: store_i8_stride7_vf2: 230; AVX512BW-FCP: # %bb.0: 231; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 232; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 233; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 234; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 235; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 236; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 237; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 238; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 239; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 240; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 241; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 242; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] 243; AVX512BW-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) 244; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) 245; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) 246; AVX512BW-FCP-NEXT: retq 247; 248; AVX512DQ-BW-LABEL: store_i8_stride7_vf2: 
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
; AVX512DQ-BW-NEXT: vpextrw $6, %xmm0, 12(%rax)
; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 8(%rax)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
; AVX512DQ-BW-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax)
; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i8>, ptr %in.vecptr4, align 64
  %in.vec5 = load <2 x i8>, ptr %in.vecptr5, align 64
  %in.vec6 = load <2 x i8>, ptr %in.vecptr6, align 64
  %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <2 x i8> %in.vec4, <2 x i8> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <2 x i8> %in.vec6, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %6 = shufflevector <4 x i8> %3, <4 x i8> %5, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
  %7 = shufflevector <6 x i8> %6, <6 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef>
  %8 = shufflevector <8 x i8> %4, <8 x i8> %7, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
  %interleaved.vec = shufflevector <14 x i8> %8, <14 x i8> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
  store <14 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride7_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa (%rdx), %xmm3
; SSE-NEXT: movdqa (%r8), %xmm5
; SSE-NEXT: movdqa (%r10), %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
; SSE-NEXT: pxor %xmm7, %xmm7
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7]
; SSE-NEXT: packuswb %xmm4, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pandn %xmm6, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,0,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,6]
; SSE-NEXT: packuswb %xmm8, %xmm6
; SSE-NEXT: pand %xmm1, %xmm6
; SSE-NEXT: por %xmm4, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
; SSE-NEXT: pand %xmm4, %xmm6
; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
; SSE-NEXT: packuswb %xmm7, %xmm8
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
; SSE-NEXT: pand %xmm7, %xmm8
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
; SSE-NEXT: pandn %xmm9, %xmm7
; SSE-NEXT: por %xmm8, %xmm7
; SSE-NEXT: pandn %xmm7, %xmm4
; SSE-NEXT: por %xmm6, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,1,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7]
; SSE-NEXT: packuswb %xmm6, %xmm5
; SSE-NEXT: pshufd
{{.*#+}} xmm6 = xmm3[3,1,2,1] 357; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 358; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] 359; SSE-NEXT: packuswb %xmm3, %xmm6 360; SSE-NEXT: pand %xmm1, %xmm6 361; SSE-NEXT: pandn %xmm5, %xmm1 362; SSE-NEXT: por %xmm6, %xmm1 363; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] 364; SSE-NEXT: pand %xmm3, %xmm1 365; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] 366; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 367; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 368; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 369; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 370; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] 371; SSE-NEXT: pand %xmm5, %xmm0 372; SSE-NEXT: pandn %xmm2, %xmm5 373; SSE-NEXT: por %xmm0, %xmm5 374; SSE-NEXT: pandn %xmm5, %xmm3 375; SSE-NEXT: por %xmm1, %xmm3 376; SSE-NEXT: movq %xmm3, 16(%rax) 377; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 378; SSE-NEXT: movd %xmm0, 24(%rax) 379; SSE-NEXT: movdqa %xmm4, (%rax) 380; SSE-NEXT: retq 381; 382; AVX-LABEL: store_i8_stride7_vf4: 383; AVX: # %bb.0: 384; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 385; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 386; AVX-NEXT: vmovdqa (%rdi), %xmm0 387; AVX-NEXT: vmovdqa (%rdx), %xmm1 388; AVX-NEXT: vmovdqa (%r8), %xmm2 389; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 390; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 391; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 392; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] 393; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] 394; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,zero,xmm0[1,5,9,13],zero,zero,zero,xmm0[2,6] 395; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4,8],zero,zero,zero,zero,xmm1[1,5,9],zero,zero 396; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 397; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,14],zero,zero,zero,xmm0[3,7,11,15],zero,zero,zero,xmm0[u,u,u,u] 398; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2,6,10],zero,zero,zero,zero,xmm1[3,7,11,u,u,u,u] 399; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 400; AVX-NEXT: vpextrd $2, %xmm0, 24(%rax) 401; AVX-NEXT: vmovq %xmm0, 16(%rax) 402; AVX-NEXT: vmovdqa %xmm2, (%rax) 403; AVX-NEXT: retq 404; 405; AVX2-LABEL: store_i8_stride7_vf4: 406; AVX2: # %bb.0: 407; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 408; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 409; AVX2-NEXT: vmovdqa (%rdi), %xmm0 410; AVX2-NEXT: vmovdqa (%rsi), %xmm1 411; AVX2-NEXT: vmovdqa (%rdx), %xmm2 412; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 413; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 414; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 415; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 416; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 417; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 418; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 419; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 420; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 421; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 422; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 423; AVX2-NEXT: vpextrd $2, %xmm1, 24(%rax) 424; AVX2-NEXT: vmovq %xmm1, 16(%rax) 425; AVX2-NEXT: vmovdqa %xmm0, (%rax) 426; AVX2-NEXT: vzeroupper 427; AVX2-NEXT: retq 428; 429; AVX2-FP-LABEL: store_i8_stride7_vf4: 430; AVX2-FP: # %bb.0: 431; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 432; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 433; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 434; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 435; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 436; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 437; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 438; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 439; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 440; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 441; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 442; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 443; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 444; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 445; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 446; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 447; AVX2-FP-NEXT: vpextrd $2, %xmm1, 24(%rax) 448; AVX2-FP-NEXT: vmovq %xmm1, 16(%rax) 449; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) 450; AVX2-FP-NEXT: vzeroupper 451; AVX2-FP-NEXT: retq 452; 453; AVX2-FCP-LABEL: store_i8_stride7_vf4: 454; AVX2-FCP: # %bb.0: 455; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 456; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 457; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 458; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 459; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 460; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 461; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 462; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 463; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 464; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 465; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 466; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 467; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 468; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 469; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 470; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 471; AVX2-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) 472; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rax) 473; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) 474; AVX2-FCP-NEXT: vzeroupper 475; AVX2-FCP-NEXT: retq 476; 477; AVX512-LABEL: store_i8_stride7_vf4: 478; AVX512: # %bb.0: 479; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 480; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 481; AVX512-NEXT: vmovdqa (%rdi), %xmm0 482; AVX512-NEXT: vmovdqa (%rsi), %xmm1 
483; AVX512-NEXT: vmovdqa (%rdx), %xmm2 484; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 485; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 486; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 487; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 488; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 489; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 490; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] 491; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 492; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 493; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 494; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 495; AVX512-NEXT: vpextrd $2, %xmm1, 24(%rax) 496; AVX512-NEXT: vmovq %xmm1, 16(%rax) 497; AVX512-NEXT: vmovdqa %xmm0, (%rax) 498; AVX512-NEXT: vzeroupper 499; AVX512-NEXT: retq 500; 501; AVX512-FCP-LABEL: store_i8_stride7_vf4: 502; AVX512-FCP: # %bb.0: 503; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 504; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 505; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 506; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 507; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 508; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 509; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 510; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 511; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 512; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 513; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 514; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] 515; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 516; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 517; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 518; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 519; AVX512-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) 520; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rax) 521; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) 522; AVX512-FCP-NEXT: vzeroupper 523; AVX512-FCP-NEXT: retq 524; 525; AVX512DQ-LABEL: store_i8_stride7_vf4: 526; AVX512DQ: # %bb.0: 527; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 528; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 529; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 530; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 531; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 532; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 533; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 534; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 535; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 536; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 537; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 538; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] 539; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 540; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 541; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 542; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 543; AVX512DQ-NEXT: vpextrd $2, %xmm1, 24(%rax) 544; AVX512DQ-NEXT: vmovq %xmm1, 16(%rax) 545; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) 546; AVX512DQ-NEXT: vzeroupper 547; AVX512DQ-NEXT: retq 548; 549; AVX512DQ-FCP-LABEL: store_i8_stride7_vf4: 550; AVX512DQ-FCP: # %bb.0: 551; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 552; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 553; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 554; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 555; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 556; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 557; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 558; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 559; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 560; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 561; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 562; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] 563; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 564; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 565; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 566; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 567; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) 568; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rax) 569; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) 570; AVX512DQ-FCP-NEXT: vzeroupper 571; AVX512DQ-FCP-NEXT: retq 572; 573; AVX512BW-LABEL: store_i8_stride7_vf4: 574; AVX512BW: # %bb.0: 575; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 576; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 577; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 578; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 579; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 580; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 581; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 582; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 583; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 584; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 585; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 586; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 587; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 588; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 589; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 590; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 591; AVX512BW-NEXT: vpextrd $2, %xmm1, 24(%rax) 592; 
AVX512BW-NEXT: vmovq %xmm1, 16(%rax) 593; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) 594; AVX512BW-NEXT: vzeroupper 595; AVX512BW-NEXT: retq 596; 597; AVX512BW-FCP-LABEL: store_i8_stride7_vf4: 598; AVX512BW-FCP: # %bb.0: 599; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 600; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 601; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 602; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 603; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 604; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 605; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 606; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 607; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 608; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 609; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 610; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 611; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 612; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 613; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 614; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 615; AVX512BW-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) 616; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rax) 617; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) 618; AVX512BW-FCP-NEXT: vzeroupper 619; AVX512BW-FCP-NEXT: retq 620; 621; AVX512DQ-BW-LABEL: store_i8_stride7_vf4: 622; AVX512DQ-BW: # %bb.0: 623; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 624; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 625; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 626; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 627; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 628; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] 629; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 630; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 631; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 632; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 633; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 634; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 635; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 636; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero 637; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 638; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 639; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, 24(%rax) 640; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rax) 641; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax) 642; AVX512DQ-BW-NEXT: vzeroupper 643; AVX512DQ-BW-NEXT: retq 644; 645; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf4: 646; AVX512DQ-BW-FCP: # %bb.0: 647; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 648; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 649; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 650; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 651; AVX512DQ-BW-FCP-NEXT: vmovdqa 
(%rdx), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
  %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64
  %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64
  %in.vec4 = load <4 x i8>, ptr %in.vecptr4, align 64
  %in.vec5 = load <4 x i8>, ptr %in.vecptr5, align 64
  %in.vec6 = load <4 x i8>, ptr %in.vecptr6, align 64
  %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <4 x i8> %in.vec4, <4 x i8> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = shufflevector <4 x i8> %in.vec6, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i8> %3, <8 x i8> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %7 = shufflevector <12 x i8> %6, <12 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <16 x i8> %4, <16 x i8> %7, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
  %interleaved.vec = shufflevector <28 x i8> %8, <28 x i8> poison, <28 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27>
  store <28 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr
%out.vec) nounwind { 689; SSE-LABEL: store_i8_stride7_vf8: 690; SSE: # %bb.0: 691; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 692; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 693; SSE-NEXT: movq {{.*#+}} xmm10 = mem[0],zero 694; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 695; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 696; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero 697; SSE-NEXT: movq {{.*#+}} xmm14 = mem[0],zero 698; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 699; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 700; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero 701; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] 702; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] 703; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 704; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 705; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 706; SSE-NEXT: pand %xmm6, %xmm0 707; SSE-NEXT: movdqa %xmm4, %xmm7 708; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 709; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] 710; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] 711; SSE-NEXT: pandn %xmm8, %xmm6 712; SSE-NEXT: por %xmm0, %xmm6 713; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 714; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] 715; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 716; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 717; SSE-NEXT: pand %xmm8, %xmm0 718; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 719; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm10[0,2,1,3,4,5,6,7] 720; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,0] 721; SSE-NEXT: movdqa %xmm8, %xmm12 722; SSE-NEXT: pandn %xmm11, %xmm12 723; SSE-NEXT: por %xmm0, %xmm12 724; SSE-NEXT: pand %xmm9, %xmm12 725; SSE-NEXT: pandn %xmm6, %xmm9 726; SSE-NEXT: por %xmm12, %xmm9 727; SSE-NEXT: pxor %xmm0, %xmm0 728; SSE-NEXT: movdqa %xmm5, %xmm12 729; SSE-NEXT: movdqa %xmm5, %xmm15 730; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] 731; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] 732; SSE-NEXT: movdqa %xmm12, %xmm13 733; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] 734; SSE-NEXT: movdqa %xmm13, %xmm0 735; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 736; SSE-NEXT: movdqa %xmm13, %xmm6 737; SSE-NEXT: packuswb %xmm0, %xmm6 738; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] 739; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] 740; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 741; SSE-NEXT: movdqa %xmm11, %xmm14 742; SSE-NEXT: pandn %xmm0, %xmm14 743; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3] 744; SSE-NEXT: pand %xmm11, %xmm0 745; SSE-NEXT: por %xmm0, %xmm14 746; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 747; SSE-NEXT: pand %xmm6, %xmm9 748; SSE-NEXT: pandn %xmm14, %xmm6 749; SSE-NEXT: por %xmm9, %xmm6 750; SSE-NEXT: pshufd {{.*#+}} xmm0 
= xmm7[2,2,3,3] 751; SSE-NEXT: movdqa %xmm11, %xmm9 752; SSE-NEXT: pandn %xmm0, %xmm9 753; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] 754; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 755; SSE-NEXT: pand %xmm11, %xmm0 756; SSE-NEXT: por %xmm9, %xmm0 757; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 758; SSE-NEXT: movdqa %xmm1, %xmm9 759; SSE-NEXT: pandn %xmm0, %xmm9 760; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7] 761; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] 762; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 763; SSE-NEXT: movdqa %xmm0, %xmm14 764; SSE-NEXT: pandn %xmm2, %xmm14 765; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] 766; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 767; SSE-NEXT: pand %xmm0, %xmm2 768; SSE-NEXT: por %xmm2, %xmm14 769; SSE-NEXT: pand %xmm1, %xmm14 770; SSE-NEXT: por %xmm9, %xmm14 771; SSE-NEXT: movdqa %xmm15, %xmm2 772; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] 773; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] 774; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] 775; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] 776; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[0,1,2,2,4,5,6,7] 777; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 778; SSE-NEXT: packuswb %xmm2, %xmm9 779; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 780; SSE-NEXT: pand %xmm12, %xmm9 781; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 782; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] 783; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 784; SSE-NEXT: pandn %xmm2, %xmm12 785; SSE-NEXT: por %xmm9, %xmm12 786; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 787; SSE-NEXT: pand %xmm2, %xmm12 788; SSE-NEXT: pandn %xmm14, %xmm2 789; SSE-NEXT: por %xmm2, %xmm12 790; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] 791; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 792; SSE-NEXT: packuswb %xmm13, %xmm2 793; SSE-NEXT: pand %xmm0, %xmm2 794; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7] 795; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] 796; SSE-NEXT: pandn %xmm9, %xmm0 797; SSE-NEXT: por %xmm2, %xmm0 798; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2] 799; SSE-NEXT: movdqa %xmm3, %xmm10 800; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,1,2,3,4,5,6,7] 801; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] 802; SSE-NEXT: pand %xmm11, %xmm9 803; SSE-NEXT: pandn %xmm2, %xmm11 804; SSE-NEXT: por %xmm9, %xmm11 805; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 806; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,2,2,4,5,6,7] 807; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 808; SSE-NEXT: pand %xmm8, %xmm2 809; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,2,1] 810; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] 811; SSE-NEXT: pandn %xmm7, %xmm8 812; SSE-NEXT: por %xmm2, %xmm8 813; SSE-NEXT: pand %xmm1, %xmm8 814; SSE-NEXT: pandn %xmm11, %xmm1 815; SSE-NEXT: por %xmm8, %xmm1 816; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 817; SSE-NEXT: pand %xmm2, %xmm1 818; SSE-NEXT: pandn %xmm0, %xmm2 819; SSE-NEXT: por %xmm2, %xmm1 820; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 821; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] 822; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] 823; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 824; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] 825; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] 826; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] 827; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] 828; SSE-NEXT: pand %xmm0, %xmm4 829; SSE-NEXT: pandn %xmm2, %xmm0 830; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] 831; SSE-NEXT: por %xmm4, %xmm0 832; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,3,3,3,4,5,6,7] 833; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] 834; SSE-NEXT: pand %xmm4, %xmm3 835; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 836; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 837; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] 838; SSE-NEXT: pandn %xmm2, %xmm4 839; SSE-NEXT: por %xmm3, %xmm4 840; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] 841; SSE-NEXT: pand %xmm2, %xmm0 842; SSE-NEXT: pandn %xmm4, %xmm2 843; SSE-NEXT: por %xmm0, %xmm2 844; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 845; SSE-NEXT: movq %xmm2, 48(%rax) 846; SSE-NEXT: movdqa %xmm1, 16(%rax) 847; SSE-NEXT: movdqa %xmm12, 32(%rax) 848; SSE-NEXT: movdqa %xmm6, (%rax) 849; SSE-NEXT: retq 850; 851; AVX-LABEL: store_i8_stride7_vf8: 852; AVX: # %bb.0: 853; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 854; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 855; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 856; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 857; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 858; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 859; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 860; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 861; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 862; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 863; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 864; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 865; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u],zero,zero,xmm2[5,13,u,u,u],zero,zero,xmm2[6,14,u,u] 866; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,5,13],zero,zero,xmm1[u,u,u,6,14],zero,zero,xmm1[u,u] 867; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 868; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,12],zero,xmm0[u,u,u,u,5,13],zero,xmm0[u,u,u,u,6,14] 869; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero 870; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 871; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 872; AVX-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4 873; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u],zero,zero,xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u] 874; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 875; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 876; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] 877; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] 878; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 879; AVX-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 880; AVX-NEXT: 
vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 881; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero 882; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] 883; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 884; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,0,8],zero,xmm0[u,u,u,u,1,9],zero,xmm0[u,u] 885; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u],zero,zero,xmm3[0,u,u,u,u],zero,zero,xmm3[1,u,u] 886; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 887; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 888; AVX-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 889; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[u,u,u,3,11],zero,zero,xmm1[u,u,u,4,12],zero,zero 890; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,10,u,u,u],zero,zero,xmm2[3,11,u,u,u],zero,zero,xmm2[4,12] 891; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 892; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,10],zero,xmm0[u,u,u,u,3,11],zero,xmm0[u,u,u,u] 893; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] 894; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 895; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 896; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 897; AVX-NEXT: vmovdqa %xmm0, 16(%rax) 898; AVX-NEXT: vmovdqa %xmm6, (%rax) 899; AVX-NEXT: vmovq %xmm5, 48(%rax) 900; AVX-NEXT: vmovdqa %xmm4, 32(%rax) 901; AVX-NEXT: retq 902; 903; AVX2-LABEL: store_i8_stride7_vf8: 904; AVX2: # %bb.0: 905; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 906; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 907; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 908; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 909; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 910; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 911; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 912; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 913; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 914; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 915; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 916; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 917; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 918; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 919; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] 920; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 921; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] 922; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 923; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm3 924; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] 925; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 926; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero 927; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 928; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] 929; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 930; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] 931; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero 932; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2 933; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero 934; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero 935; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 936; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 937; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 938; AVX2-NEXT: vmovdqa %ymm0, (%rax) 939; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 940; AVX2-NEXT: vmovq %xmm0, 48(%rax) 941; AVX2-NEXT: vmovdqa %xmm3, 32(%rax) 942; AVX2-NEXT: vzeroupper 943; AVX2-NEXT: retq 944; 945; AVX2-FP-LABEL: store_i8_stride7_vf8: 946; AVX2-FP: # %bb.0: 947; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 948; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 949; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 950; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 951; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 952; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 953; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 954; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 955; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 956; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 957; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 958; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 959; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 960; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 961; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] 962; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 963; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] 964; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 965; AVX2-FP-NEXT: vpor %ymm5, %ymm3, %ymm3 966; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] 967; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 968; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero 969; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5 970; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] 971; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 972; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] 973; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero 974; AVX2-FP-NEXT: vpor %ymm4, %ymm2, %ymm2 975; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero 976; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero 977; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 978; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 979; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 980; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) 981; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0 982; AVX2-FP-NEXT: vmovq %xmm0, 48(%rax) 983; AVX2-FP-NEXT: vmovdqa %xmm3, 32(%rax) 984; AVX2-FP-NEXT: vzeroupper 985; AVX2-FP-NEXT: retq 986; 987; AVX2-FCP-LABEL: store_i8_stride7_vf8: 988; AVX2-FCP: # %bb.0: 989; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 990; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 991; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 992; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 993; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 994; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 995; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 996; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 997; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 998; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 999; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1000; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1001; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1002; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 1003; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1004; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] 1005; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 1006; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 1007; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1008; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] 1009; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm3 1010; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero 1011; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 1012; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] 1013; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 1014; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero 1015; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 1016; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] 1017; 
AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1018; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 1019; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] 1020; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 1021; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 1022; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) 1023; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 1024; AVX2-FCP-NEXT: vmovq %xmm0, 48(%rax) 1025; AVX2-FCP-NEXT: vmovdqa %xmm1, 32(%rax) 1026; AVX2-FCP-NEXT: vzeroupper 1027; AVX2-FCP-NEXT: retq 1028; 1029; AVX512-LABEL: store_i8_stride7_vf8: 1030; AVX512: # %bb.0: 1031; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1032; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 1033; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1034; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1035; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1036; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1037; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1038; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1039; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1040; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1041; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1042; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1043; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1044; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 1045; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] 1046; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] 1047; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u] 1048; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1049; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 1050; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero 1051; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero 1052; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 1053; AVX512-NEXT: vporq %zmm2, %zmm3, %zmm2 1054; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] 1055; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] 1056; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u] 1057; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 1058; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] 1059; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] 1060; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1061; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 1062; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1063; AVX512-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 1064; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 1065; AVX512-NEXT: vmovq %xmm1, 48(%rax) 1066; AVX512-NEXT: vmovdqa %ymm0, (%rax) 1067; 
AVX512-NEXT: vzeroupper 1068; AVX512-NEXT: retq 1069; 1070; AVX512-FCP-LABEL: store_i8_stride7_vf8: 1071; AVX512-FCP: # %bb.0: 1072; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1073; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1074; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1075; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1076; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1077; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1078; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1079; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1080; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1081; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1082; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1083; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1084; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1085; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] 1086; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 1087; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] 1088; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] 1089; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 1090; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 1091; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] 1092; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 1093; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero 1094; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1095; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] 1096; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 1097; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero 1098; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1) 1099; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] 1100; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] 1101; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 1102; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] 1103; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1104; AVX512-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 1105; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 1106; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 1107; AVX512-FCP-NEXT: vmovq %xmm1, 48(%rax) 1108; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 1109; AVX512-FCP-NEXT: vzeroupper 1110; AVX512-FCP-NEXT: retq 1111; 1112; AVX512DQ-LABEL: store_i8_stride7_vf8: 1113; AVX512DQ: # %bb.0: 1114; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1115; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 1116; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1117; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1118; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1119; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1120; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1121; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1122; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1123; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = 
xmm4[0],xmm3[0] 1124; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1125; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1126; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1127; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 1128; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] 1129; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] 1130; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u] 1131; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1132; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 1133; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero 1134; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero 1135; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 1136; AVX512DQ-NEXT: vporq %zmm2, %zmm3, %zmm2 1137; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] 1138; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] 1139; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u] 1140; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 1141; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] 1142; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] 1143; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1144; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 1145; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 1146; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 1147; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 1148; AVX512DQ-NEXT: vmovq %xmm1, 48(%rax) 1149; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 1150; AVX512DQ-NEXT: vzeroupper 1151; AVX512DQ-NEXT: retq 1152; 1153; AVX512DQ-FCP-LABEL: store_i8_stride7_vf8: 1154; AVX512DQ-FCP: # %bb.0: 1155; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1156; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1157; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1158; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1159; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1160; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1161; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1162; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1163; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1164; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1165; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1166; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1167; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1168; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] 1169; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 1170; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] 1171; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] 1172; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 
1173; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero 1174; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] 1175; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 1176; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero 1177; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1178; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] 1179; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 1180; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero 1181; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1) 1182; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] 1183; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] 1184; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 1185; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] 1186; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1187; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 1188; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) 1189; AVX512DQ-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 1190; AVX512DQ-FCP-NEXT: vmovq %xmm1, 48(%rax) 1191; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 1192; AVX512DQ-FCP-NEXT: vzeroupper 1193; AVX512DQ-FCP-NEXT: retq 1194; 1195; AVX512BW-LABEL: store_i8_stride7_vf8: 1196; AVX512BW: # %bb.0: 1197; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1198; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1199; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1200; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1201; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1202; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1203; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1204; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1205; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1206; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1207; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1208; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1209; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1210; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 1211; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 1212; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] 1213; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero 1214; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] 1215; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero 1216; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 1217; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 1218; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1219; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] 1220; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1221; AVX512BW-NEXT: vporq %zmm2, %zmm1, %zmm1 1222; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 1223; AVX512BW-NEXT: kmovq %rcx, %k1 1224; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} 1225; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) 1226; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 1227; AVX512BW-NEXT: vmovq %xmm0, 48(%rax) 1228; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) 1229; AVX512BW-NEXT: vzeroupper 1230; AVX512BW-NEXT: retq 1231; 1232; AVX512BW-FCP-LABEL: store_i8_stride7_vf8: 1233; AVX512BW-FCP: # %bb.0: 1234; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1235; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1236; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1237; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1238; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1239; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1240; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1241; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1242; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1243; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1244; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1245; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1246; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1247; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] 1248; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 1249; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] 1250; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 1251; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero 1252; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 1253; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] 1254; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1255; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 1256; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 1257; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 1258; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} 
ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] 1259; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] 1260; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1261; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 1262; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1263; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] 1264; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 1265; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero 1266; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 1267; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 1268; AVX512BW-FCP-NEXT: vmovq %xmm2, 48(%rax) 1269; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) 1270; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 1271; AVX512BW-FCP-NEXT: vzeroupper 1272; AVX512BW-FCP-NEXT: retq 1273; 1274; AVX512DQ-BW-LABEL: store_i8_stride7_vf8: 1275; AVX512DQ-BW: # %bb.0: 1276; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1277; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1278; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1279; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1280; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1281; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1282; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1283; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1284; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1285; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1286; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1287; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1288; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1289; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] 1290; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 1291; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] 1292; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero 1293; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] 1294; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero 1295; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 1296; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 1297; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1298; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] 
1299; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1300; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm1, %zmm1 1301; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 1302; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 1303; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} 1304; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) 1305; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 1306; AVX512DQ-BW-NEXT: vmovq %xmm0, 48(%rax) 1307; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax) 1308; AVX512DQ-BW-NEXT: vzeroupper 1309; AVX512DQ-BW-NEXT: retq 1310; 1311; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf8: 1312; AVX512DQ-BW-FCP: # %bb.0: 1313; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1314; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1315; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1316; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1317; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1318; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1319; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] 1320; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1321; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 1322; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] 1323; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1324; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 1325; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1326; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] 1327; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 1328; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] 1329; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 1330; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero 1331; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 1332; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] 1333; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1334; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 1335; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 1336; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 1337; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] 1338; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] 1339; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] 1340; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 1341; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1342; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] 1343; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 1344; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero 1345; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 1346; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 1347; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, 48(%rax) 1348; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) 1349; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 1350; AVX512DQ-BW-FCP-NEXT: vzeroupper 1351; AVX512DQ-BW-FCP-NEXT: retq 1352 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 1353 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64 1354 %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64 1355 %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64 1356 %in.vec4 = load <8 x i8>, ptr %in.vecptr4, align 64 1357 %in.vec5 = load <8 x i8>, ptr %in.vecptr5, align 64 1358 %in.vec6 = load <8 x i8>, ptr %in.vecptr6, align 64 1359 %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1360 %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1361 %3 = shufflevector <8 x i8> %in.vec4, <8 x i8> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1362 %4 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1363 %5 = shufflevector <8 x i8> %in.vec6, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1364 %6 = shufflevector <16 x i8> %3, <16 x i8> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 1365 %7 = shufflevector <24 x i8> %6, <24 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1366 %8 = shufflevector <32 x i8> %4, <32 x i8> %7, <56 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55> 1367 %interleaved.vec = shufflevector <56 x i8> %8, <56 x i8> poison, <56 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 7, i32 15, 
i32 23, i32 31, i32 39, i32 47, i32 55> 1368 store <56 x i8> %interleaved.vec, ptr %out.vec, align 64 1369 ret void 1370} 1371 1372define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { 1373; SSE-LABEL: store_i8_stride7_vf16: 1374; SSE: # %bb.0: 1375; SSE-NEXT: subq $56, %rsp 1376; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1377; SSE-NEXT: movdqa (%rdi), %xmm12 1378; SSE-NEXT: movdqa (%rsi), %xmm4 1379; SSE-NEXT: movdqa (%rdx), %xmm0 1380; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1381; SSE-NEXT: movdqa (%rcx), %xmm5 1382; SSE-NEXT: movdqa (%r8), %xmm7 1383; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1384; SSE-NEXT: movdqa (%r9), %xmm8 1385; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 1386; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1387; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 1388; SSE-NEXT: pand %xmm13, %xmm0 1389; SSE-NEXT: movdqa %xmm5, %xmm1 1390; SSE-NEXT: movdqa %xmm5, %xmm6 1391; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1392; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 1393; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1394; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] 1395; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1396; SSE-NEXT: movdqa %xmm13, %xmm2 1397; SSE-NEXT: pandn %xmm1, %xmm2 1398; SSE-NEXT: por %xmm0, %xmm2 1399; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 1400; SSE-NEXT: movdqa %xmm0, %xmm1 1401; SSE-NEXT: pandn %xmm2, %xmm1 1402; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7] 1403; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] 1404; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 1405; SSE-NEXT: pand %xmm10, %xmm3 1406; SSE-NEXT: movdqa %xmm4, %xmm9 1407; SSE-NEXT: movdqa %xmm4, %xmm5 1408; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1409; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] 1410; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,6,6,7] 1411; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill 1412; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] 1413; SSE-NEXT: pandn %xmm4, %xmm10 1414; SSE-NEXT: por %xmm3, %xmm10 1415; SSE-NEXT: pand %xmm0, %xmm10 1416; SSE-NEXT: por %xmm1, %xmm10 1417; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7] 1418; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 1419; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] 1420; SSE-NEXT: movdqa %xmm2, %xmm4 1421; SSE-NEXT: pandn %xmm1, %xmm4 1422; SSE-NEXT: movdqa %xmm8, %xmm1 1423; SSE-NEXT: movdqa %xmm8, %xmm3 1424; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1425; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 1426; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1427; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] 1428; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] 1429; SSE-NEXT: pand %xmm2, 
%xmm1 1430; SSE-NEXT: por %xmm4, %xmm1 1431; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] 1432; SSE-NEXT: pand %xmm4, %xmm10 1433; SSE-NEXT: pandn %xmm1, %xmm4 1434; SSE-NEXT: movdqa (%rax), %xmm15 1435; SSE-NEXT: por %xmm10, %xmm4 1436; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,6,7,7,7] 1437; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] 1438; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 1439; SSE-NEXT: movdqa %xmm11, %xmm7 1440; SSE-NEXT: pandn %xmm1, %xmm7 1441; SSE-NEXT: pand %xmm11, %xmm4 1442; SSE-NEXT: por %xmm4, %xmm7 1443; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1444; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3] 1445; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1446; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] 1447; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 1448; SSE-NEXT: movdqa %xmm10, %xmm4 1449; SSE-NEXT: pandn %xmm1, %xmm4 1450; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] 1451; SSE-NEXT: movdqa %xmm12, %xmm5 1452; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 1453; SSE-NEXT: pand %xmm10, %xmm1 1454; SSE-NEXT: por %xmm1, %xmm4 1455; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] 1456; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1457; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] 1458; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] 1459; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 1460; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 1461; SSE-NEXT: movdqa %xmm12, %xmm14 1462; SSE-NEXT: pandn %xmm1, %xmm14 1463; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1464; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] 1465; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 1466; SSE-NEXT: pand %xmm12, %xmm1 1467; SSE-NEXT: por %xmm1, %xmm14 1468; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 1469; SSE-NEXT: pand %xmm1, %xmm4 1470; SSE-NEXT: pandn %xmm14, %xmm1 1471; SSE-NEXT: por %xmm4, %xmm1 1472; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] 1473; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1474; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] 1475; SSE-NEXT: movdqa %xmm2, %xmm14 1476; SSE-NEXT: pandn %xmm4, %xmm14 1477; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1478; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] 1479; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 1480; SSE-NEXT: pand %xmm2, %xmm4 1481; SSE-NEXT: por %xmm4, %xmm14 1482; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] 1483; SSE-NEXT: pand %xmm4, %xmm14 1484; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[3,3,3,3,4,5,6,7] 1485; SSE-NEXT: movdqa %xmm15, %xmm3 1486; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] 1487; SSE-NEXT: pandn %xmm7, %xmm4 1488; SSE-NEXT: por %xmm14, %xmm4 1489; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 1490; SSE-NEXT: pand %xmm7, %xmm1 1491; SSE-NEXT: pandn %xmm4, %xmm7 1492; SSE-NEXT: por %xmm1, %xmm7 1493; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1494; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,5,7] 1495; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1496; SSE-NEXT: movdqa %xmm13, %xmm4 1497; 
SSE-NEXT: pandn %xmm1, %xmm4 1498; SSE-NEXT: movdqa %xmm5, %xmm15 1499; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] 1500; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 1501; SSE-NEXT: pand %xmm13, %xmm1 1502; SSE-NEXT: por %xmm1, %xmm4 1503; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] 1504; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 1505; SSE-NEXT: movdqa %xmm2, %xmm7 1506; SSE-NEXT: pandn %xmm1, %xmm7 1507; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1508; SSE-NEXT: # xmm1 = mem[2,1,2,3] 1509; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 1510; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 1511; SSE-NEXT: pand %xmm2, %xmm1 1512; SSE-NEXT: por %xmm7, %xmm1 1513; SSE-NEXT: pand %xmm0, %xmm1 1514; SSE-NEXT: pandn %xmm4, %xmm0 1515; SSE-NEXT: por %xmm1, %xmm0 1516; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1517; SSE-NEXT: # xmm1 = mem[1,1,2,3] 1518; SSE-NEXT: movdqa %xmm10, %xmm4 1519; SSE-NEXT: pandn %xmm1, %xmm4 1520; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] 1521; SSE-NEXT: movdqa %xmm6, %xmm9 1522; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] 1523; SSE-NEXT: pand %xmm10, %xmm1 1524; SSE-NEXT: por %xmm4, %xmm1 1525; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6] 1526; SSE-NEXT: movdqa %xmm3, %xmm6 1527; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] 1528; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 1529; SSE-NEXT: movdqa %xmm5, %xmm7 1530; SSE-NEXT: pandn %xmm4, %xmm7 1531; SSE-NEXT: pand %xmm5, %xmm1 1532; SSE-NEXT: por %xmm1, %xmm7 1533; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 1534; SSE-NEXT: pand %xmm1, %xmm0 1535; SSE-NEXT: pandn %xmm7, %xmm1 1536; SSE-NEXT: por %xmm0, %xmm1 1537; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1538; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1539; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1540; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1541; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1542; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1543; SSE-NEXT: movdqa %xmm5, %xmm1 1544; SSE-NEXT: pandn %xmm0, %xmm1 1545; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7] 1546; SSE-NEXT: movdqa %xmm8, %xmm3 1547; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1548; SSE-NEXT: pand %xmm5, %xmm0 1549; SSE-NEXT: por %xmm0, %xmm1 1550; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1551; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1552; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1553; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] 1554; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1555; SSE-NEXT: movdqa %xmm11, %xmm7 1556; SSE-NEXT: pandn %xmm0, %xmm7 1557; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] 1558; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 1559; SSE-NEXT: pand %xmm11, %xmm0 1560; SSE-NEXT: por %xmm0, %xmm7 1561; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 1562; SSE-NEXT: pand %xmm0, %xmm7 1563; SSE-NEXT: pandn %xmm1, %xmm0 1564; SSE-NEXT: por %xmm7, %xmm0 1565; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1566; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1567; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 1568; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1569; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 1570; SSE-NEXT: movdqa %xmm13, %xmm7 1571; SSE-NEXT: pandn %xmm1, %xmm7 1572; SSE-NEXT: movdqa %xmm9, %xmm5 1573; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] 1574; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1575; SSE-NEXT: pand %xmm13, %xmm1 1576; SSE-NEXT: por %xmm1, %xmm7 1577; SSE-NEXT: movdqa %xmm6, %xmm8 1578; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] 1579; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1580; SSE-NEXT: movdqa %xmm2, %xmm9 1581; SSE-NEXT: pandn %xmm1, %xmm9 1582; SSE-NEXT: pand %xmm2, %xmm7 1583; SSE-NEXT: por %xmm7, %xmm9 1584; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 1585; SSE-NEXT: pand %xmm14, %xmm0 1586; SSE-NEXT: pandn %xmm9, %xmm14 1587; SSE-NEXT: por %xmm0, %xmm14 1588; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1589; SSE-NEXT: # xmm0 = mem[0,1,1,3] 1590; SSE-NEXT: movdqa %xmm10, %xmm7 1591; SSE-NEXT: pandn %xmm0, %xmm7 1592; SSE-NEXT: movdqa %xmm3, %xmm1 1593; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7] 1594; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,3,2] 1595; SSE-NEXT: pand %xmm10, %xmm9 1596; SSE-NEXT: por %xmm7, %xmm9 1597; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5] 1598; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1599; SSE-NEXT: movdqa %xmm2, %xmm7 1600; SSE-NEXT: pandn %xmm0, %xmm7 1601; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload 1602; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] 1603; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,1] 1604; SSE-NEXT: pand %xmm2, %xmm6 1605; SSE-NEXT: por %xmm7, %xmm6 1606; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 1607; SSE-NEXT: pand %xmm0, %xmm6 1608; SSE-NEXT: pandn %xmm9, %xmm0 1609; SSE-NEXT: por %xmm6, %xmm0 1610; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 1611; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] 1612; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 1613; SSE-NEXT: movdqa %xmm12, %xmm6 1614; SSE-NEXT: pandn %xmm3, %xmm6 1615; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] 1616; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 1617; SSE-NEXT: pand %xmm12, %xmm3 1618; SSE-NEXT: por %xmm3, %xmm6 1619; SSE-NEXT: pand %xmm13, %xmm6 1620; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] 1621; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1622; SSE-NEXT: pandn %xmm3, %xmm13 1623; SSE-NEXT: por %xmm6, %xmm13 1624; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 1625; SSE-NEXT: pand %xmm3, %xmm0 1626; SSE-NEXT: pandn %xmm13, %xmm3 1627; SSE-NEXT: por %xmm0, %xmm3 1628; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1629; SSE-NEXT: # xmm0 = mem[2,2,3,3] 1630; SSE-NEXT: movdqa %xmm2, %xmm6 1631; SSE-NEXT: pandn %xmm0, %xmm6 1632; SSE-NEXT: movdqa %xmm1, %xmm13 1633; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] 1634; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 1635; SSE-NEXT: pand %xmm2, %xmm0 1636; SSE-NEXT: por %xmm6, %xmm0 1637; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 1638; SSE-NEXT: movdqa %xmm6, %xmm7 1639; SSE-NEXT: pandn %xmm0, %xmm7 1640; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1641; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] 1642; SSE-NEXT: pshufd {{.*#+}} xmm9 = 
xmm0[2,2,2,2] 1643; SSE-NEXT: movdqa %xmm12, %xmm0 1644; SSE-NEXT: pandn %xmm9, %xmm0 1645; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] 1646; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] 1647; SSE-NEXT: pand %xmm12, %xmm9 1648; SSE-NEXT: por %xmm9, %xmm0 1649; SSE-NEXT: pand %xmm6, %xmm0 1650; SSE-NEXT: por %xmm7, %xmm0 1651; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1652; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] 1653; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] 1654; SSE-NEXT: movdqa %xmm11, %xmm9 1655; SSE-NEXT: pandn %xmm7, %xmm9 1656; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 1657; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,2,2,3,4,5,6,7] 1658; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 1659; SSE-NEXT: pand %xmm11, %xmm7 1660; SSE-NEXT: por %xmm7, %xmm9 1661; SSE-NEXT: pand %xmm10, %xmm9 1662; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] 1663; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] 1664; SSE-NEXT: pandn %xmm7, %xmm10 1665; SSE-NEXT: por %xmm9, %xmm10 1666; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 1667; SSE-NEXT: pand %xmm7, %xmm10 1668; SSE-NEXT: pandn %xmm0, %xmm7 1669; SSE-NEXT: por %xmm7, %xmm10 1670; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] 1671; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[1,1,2,3,4,5,6,7] 1672; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] 1673; SSE-NEXT: pand %xmm2, %xmm7 1674; SSE-NEXT: pandn %xmm0, %xmm2 1675; SSE-NEXT: por %xmm7, %xmm2 1676; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7] 1677; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 1678; SSE-NEXT: pand %xmm11, %xmm0 1679; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 1680; SSE-NEXT: # xmm7 = mem[1,1,2,1] 1681; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] 1682; SSE-NEXT: pandn %xmm7, %xmm11 1683; SSE-NEXT: por %xmm0, %xmm11 1684; SSE-NEXT: pand %xmm6, %xmm11 1685; SSE-NEXT: pandn %xmm2, %xmm6 1686; SSE-NEXT: por %xmm11, %xmm6 1687; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7] 1688; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1689; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 1690; SSE-NEXT: pand %xmm1, %xmm0 1691; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] 1692; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 1693; SSE-NEXT: pandn %xmm2, %xmm1 1694; SSE-NEXT: por %xmm0, %xmm1 1695; SSE-NEXT: pand %xmm12, %xmm1 1696; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] 1697; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1698; SSE-NEXT: pandn %xmm0, %xmm12 1699; SSE-NEXT: por %xmm1, %xmm12 1700; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 1701; SSE-NEXT: pand %xmm0, %xmm6 1702; SSE-NEXT: pandn %xmm12, %xmm0 1703; SSE-NEXT: por %xmm6, %xmm0 1704; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1705; SSE-NEXT: movdqa %xmm0, 16(%rax) 1706; SSE-NEXT: movdqa %xmm10, 32(%rax) 1707; SSE-NEXT: movdqa %xmm3, 64(%rax) 1708; SSE-NEXT: movdqa %xmm14, (%rax) 1709; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1710; SSE-NEXT: movaps %xmm0, 80(%rax) 1711; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1712; SSE-NEXT: movaps %xmm0, 48(%rax) 1713; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1714; SSE-NEXT: movaps %xmm0, 96(%rax) 1715; SSE-NEXT: addq $56, %rsp 1716; SSE-NEXT: retq 1717; 1718; AVX-LABEL: store_i8_stride7_vf16: 1719; AVX: # %bb.0: 1720; 
AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1721; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 1722; AVX-NEXT: vmovdqa (%rdi), %xmm3 1723; AVX-NEXT: vmovdqa (%rsi), %xmm5 1724; AVX-NEXT: vmovdqa (%rdx), %xmm6 1725; AVX-NEXT: vmovdqa (%rcx), %xmm7 1726; AVX-NEXT: vmovdqa (%r8), %xmm0 1727; AVX-NEXT: vmovdqa (%r9), %xmm2 1728; AVX-NEXT: vmovdqa (%r10), %xmm1 1729; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 1730; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9],zero,zero 1731; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 1732; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4,5],zero,zero,zero,zero,zero,xmm11[6,7],zero,zero,zero,zero,zero,xmm11[8,9] 1733; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4 1734; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm11[0,1],zero,zero,zero,zero,zero,xmm11[2,3],zero,zero,zero,zero,zero 1735; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero,xmm10[4,5] 1736; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 1737; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 1738; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1739; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,4,5],zero,xmm9[u,u,u,u,6,7],zero,xmm9[u,u,u,u] 1740; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u] 1741; AVX-NEXT: vpor %xmm12, %xmm8, %xmm8 1742; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] 1743; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1744; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] 1745; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 1746; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 1747; AVX-NEXT: vandnps %ymm8, %ymm12, %ymm8 1748; AVX-NEXT: vorps %ymm4, %ymm8, %ymm4 1749; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero 1750; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] 1751; AVX-NEXT: vpor %xmm8, %xmm12, %xmm12 1752; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u] 1753; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u] 1754; AVX-NEXT: vpor %xmm8, %xmm13, %xmm13 1755; AVX-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 1756; AVX-NEXT: vpblendvb %xmm8, %xmm12, %xmm13, %xmm12 1757; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm11[10,11],zero,zero,zero,zero,zero,xmm11[12,13],zero,zero 1758; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[10,11],zero,zero,zero,zero,zero,xmm10[12,13],zero,zero,zero,zero 1759; AVX-NEXT: vpor %xmm11, %xmm10, %xmm10 1760; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 1761; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 1762; AVX-NEXT: vandps %ymm11, %ymm10, %ymm10 1763; AVX-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u] 1764; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u] 1765; AVX-NEXT: vpor %xmm12, %xmm13, %xmm12 1766; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm12[u,u,u,u,5,6],zero,xmm12[u,u,u,u,12,13],zero,xmm12[u] 1767; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[6,u,u,u,u],zero,zero,xmm1[7,u,u,u,u],zero,zero,xmm1[8,u] 1768; AVX-NEXT: vpor %xmm13, %xmm12, %xmm12 1769; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9],zero,xmm9[u,u,u,u,10,11],zero,xmm9[u,u,u,u,12,13] 1770; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero 1771; AVX-NEXT: vpor %xmm13, %xmm9, %xmm9 1772; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm9 1773; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 1774; AVX-NEXT: vorps %ymm9, %ymm10, %ymm9 1775; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 1776; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm10[8,9],zero,zero,zero,zero,zero,xmm10[10,11],zero,zero,zero 1777; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 1778; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6],zero,zero,zero,zero,zero,xmm12[9,8],zero,zero,zero,zero,zero,xmm12[11,10],zero 1779; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 1780; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] 1781; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[2,3],zero,zero,zero,zero,zero,xmm6[4,5],zero,zero,zero,zero,zero,xmm6[6] 1782; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 1783; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2],zero,zero,zero,zero,zero,xmm3[5,4],zero,zero,zero,zero,zero,xmm3[7,6],zero 1784; AVX-NEXT: vpor %xmm7, %xmm3, %xmm3 1785; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 1786; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1787; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] 1788; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u],zero,zero,xmm1[11,u,u,u,u],zero,zero,xmm1[12,u,u,u,u],zero 1789; AVX-NEXT: vpor %xmm7, %xmm11, %xmm7 1790; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] 1791; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u],zero,zero,xmm1[9,u,u,u,u],zero,zero,xmm1[10,u,u,u] 1792; AVX-NEXT: vpor %xmm5, %xmm11, %xmm5 1793; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 1794; AVX-NEXT: vandnps %ymm5, %ymm8, %ymm5 1795; AVX-NEXT: vorps %ymm5, %ymm3, %ymm3 1796; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm6[12,13],zero,zero,zero,zero,zero,xmm6[14,15],zero,zero,zero 1797; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm10[12,13],zero,zero,zero,zero,zero,xmm10[14,15],zero,zero,zero,zero,zero 1798; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 1799; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 1800; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero 1801; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] 1802; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1803; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 1804; AVX-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 1805; AVX-NEXT: vmovdqa %xmm0, 96(%rax) 1806; AVX-NEXT: vmovaps %ymm3, 64(%rax) 1807; AVX-NEXT: vmovaps %ymm9, 32(%rax) 1808; AVX-NEXT: vmovaps %ymm4, (%rax) 1809; AVX-NEXT: vzeroupper 1810; AVX-NEXT: retq 1811; 1812; AVX2-LABEL: store_i8_stride7_vf16: 1813; AVX2: # %bb.0: 1814; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1815; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 1816; AVX2-NEXT: vmovdqa (%rdi), %xmm3 1817; AVX2-NEXT: vmovdqa (%rdx), %xmm4 1818; AVX2-NEXT: vmovdqa (%r8), %xmm1 1819; AVX2-NEXT: vmovdqa (%r9), %xmm2 1820; AVX2-NEXT: vmovdqa (%r10), %xmm0 1821; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 1822; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 1823; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 1824; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero,zero,zero 1825; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] 1826; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] 1827; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 1828; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero 1829; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] 1830; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero 1831; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 1832; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 1833; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 1834; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero 1835; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] 1836; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero 1837; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 1838; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 1839; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] 1840; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 1841; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1842; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 1843; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 1844; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] 1845; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] 1846; AVX2-NEXT: vpshuflw {{.*#+}} 
xmm8 = xmm0[1,1,0,0,4,5,6,7] 1847; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 1848; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] 1849; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 1850; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1851; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,2,0,2] 1852; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28] 1853; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,0,2] 1854; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero 1855; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 1856; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 1857; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 1858; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] 1859; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 1860; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] 1861; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] 1862; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 1863; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 1864; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 1865; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3] 1866; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero 1867; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1] 1868; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero 1869; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 1870; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 1871; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 1872; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] 1873; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero 1874; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 1875; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero 1876; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 1877; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1878; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] 1879; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7] 1880; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 1881; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] 1882; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1883; AVX2-NEXT: vmovdqa %ymm6, 64(%rax) 1884; AVX2-NEXT: vmovdqa %ymm7, (%rax) 1885; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 1886; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 1887; AVX2-NEXT: vmovdqa %xmm0, 
96(%rax) 1888; AVX2-NEXT: vmovdqa %ymm5, 32(%rax) 1889; AVX2-NEXT: vzeroupper 1890; AVX2-NEXT: retq 1891; 1892; AVX2-FP-LABEL: store_i8_stride7_vf16: 1893; AVX2-FP: # %bb.0: 1894; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1895; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1896; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 1897; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 1898; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 1899; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 1900; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2 1901; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 1902; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 1903; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero 1904; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] 1905; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] 1906; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 1907; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero 1908; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] 1909; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero 1910; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5 1911; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 1912; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 1913; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero 1914; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] 1915; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero 1916; AVX2-FP-NEXT: vpor %ymm7, %ymm4, %ymm7 1917; AVX2-FP-NEXT: vmovdqa (%r10), %xmm4 1918; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 1919; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] 1920; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 1921; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 1922; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 1923; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 1924; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] 1925; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] 1926; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] 1927; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] 1928; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 1929; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 1930; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] 1931; 
AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28] 1932; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] 1933; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero 1934; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 1935; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 1936; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 1937; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] 1938; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 1939; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] 1940; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 1941; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 1942; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 1943; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3] 1944; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero 1945; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1] 1946; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero 1947; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 1948; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 1949; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 1950; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 1951; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero 1952; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] 1953; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero 1954; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 1955; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1956; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero 1957; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm4[13,14,15,4,5],zero,zero,xmm4[14,15,14,15,12],zero,zero,xmm4[15] 1958; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 1959; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax) 1960; AVX2-FP-NEXT: vmovdqa %ymm7, (%rax) 1961; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 1962; AVX2-FP-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1963; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax) 1964; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax) 1965; AVX2-FP-NEXT: vzeroupper 1966; AVX2-FP-NEXT: retq 1967; 1968; AVX2-FCP-LABEL: store_i8_stride7_vf16: 1969; AVX2-FCP: # %bb.0: 1970; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1971; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1972; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 1973; AVX2-FCP-NEXT: vmovdqa (%rdx), 
%xmm4 1974; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1 1975; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm2 1976; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm0 1977; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 1978; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 1979; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 1980; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] 1981; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1] 1982; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5 1983; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] 1984; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] 1985; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 1986; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 1987; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] 1988; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] 1989; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] 1990; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero 1991; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 1992; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 1993; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 1994; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 1995; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] 1996; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,5,2,6,1,5,2,6] 1997; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 1998; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm9 1999; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u] 2000; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 2001; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm7, %ymm7 2002; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 2003; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm9[1,5],zero,zero,zero,zero,zero,ymm9[2,6],zero,zero,zero,zero,zero,ymm9[19,23],zero,zero,zero,zero,zero,ymm9[24,28],zero,zero,zero,zero 2004; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm8 2005; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm8[1,5],zero,zero,zero,zero,zero,ymm8[2,6],zero,zero,zero,zero,zero,ymm8[19,23],zero,zero,zero,zero,zero,ymm8[24,28],zero,zero,zero,zero,zero,ymm8[25] 2006; AVX2-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8 2007; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 2008; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 2009; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] 2010; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] 2011; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] 2012; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 2013; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] 2014; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 2015; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 2016; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 2017; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3] 2018; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero 2019; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1] 2020; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero 2021; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 2022; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 2023; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 2024; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] 2025; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero 2026; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 2027; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero 2028; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 2029; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2030; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero 2031; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,14,15,4,5],zero,zero,xmm0[14,15,14,15,12],zero,zero,xmm0[15] 2032; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 2033; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax) 2034; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rax) 2035; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax) 2036; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 2037; AVX2-FCP-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 2038; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rax) 2039; AVX2-FCP-NEXT: vzeroupper 2040; AVX2-FCP-NEXT: retq 2041; 2042; AVX512-LABEL: store_i8_stride7_vf16: 2043; AVX512: # %bb.0: 2044; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 2045; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 2046; AVX512-NEXT: vmovdqa (%rdi), %xmm3 2047; AVX512-NEXT: vmovdqa (%rdx), %xmm4 2048; AVX512-NEXT: vmovdqa (%r8), %xmm1 2049; AVX512-NEXT: vmovdqa (%r9), %xmm2 2050; AVX512-NEXT: vmovdqa (%r10), %xmm0 2051; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 2052; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 2053; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 2054; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u] 2055; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 2056; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] 2057; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 2058; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6) 2059; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] 2060; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] 2061; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 2062; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero 2063; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] 2064; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25] 2065; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7) 2066; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] 2067; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero 2068; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 2069; AVX512-NEXT: vporq %zmm6, %zmm7, %zmm6 2070; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] 2071; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] 2072; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] 2073; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] 2074; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7) 2075; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] 2076; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] 2077; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 2078; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 2079; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] 2080; AVX512-NEXT: vpandn %ymm8, %ymm9, %ymm8 2081; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] 2082; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] 2083; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] 2084; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 2085; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 2086; AVX512-NEXT: vporq %zmm8, %zmm7, %zmm7 2087; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) 2088; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] 2089; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] 2090; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] 2091; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] 2092; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6 2093; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] 2094; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] 2095; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 2096; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] 2097; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] 2098; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) 2099; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) 2100; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] 2101; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] 2102; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 2103; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] 2104; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 2105; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2106; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero 2107; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] 2108; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 2109; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) 2110; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 2111; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) 2112; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) 2113; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) 2114; AVX512-NEXT: vzeroupper 2115; AVX512-NEXT: retq 2116; 2117; AVX512-FCP-LABEL: store_i8_stride7_vf16: 2118; AVX512-FCP: # %bb.0: 2119; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2120; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2121; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 2122; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 2123; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 2124; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 2125; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0 2126; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 2127; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 2128; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 2129; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] 2130; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] 2131; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6] 2132; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] 2133; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8 2134; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero 2135; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 2136; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] 2137; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero 2138; AVX512-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9 2139; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25] 2140; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 2141; AVX512-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6 2142; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] 2143; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0] 2144; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 2145; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 2146; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] 2147; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 2148; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2] 2149; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] 2150; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7 2151; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u] 2152; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 2153; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem) 2154; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) 2155; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] 2156; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] 2157; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] 2158; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] 2159; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 2160; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] 2161; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] 2162; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] 2163; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 2164; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] 2165; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] 2166; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) 2167; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) 2168; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] 2169; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] 2170; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 2171; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] 2172; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 2173; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2174; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero 2175; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] 2176; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 2177; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) 2178; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 2179; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) 2180; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 2181; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) 2182; AVX512-FCP-NEXT: vzeroupper 2183; AVX512-FCP-NEXT: retq 2184; 2185; AVX512DQ-LABEL: store_i8_stride7_vf16: 2186; AVX512DQ: # %bb.0: 2187; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 2188; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 2189; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3 2190; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 2191; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 2192; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 2193; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 2194; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 2195; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 2196; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 2197; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u] 2198; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] 2199; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] 2200; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 2201; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6) 2202; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] 2203; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] 2204; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 2205; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero 2206; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] 2207; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25] 2208; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7) 2209; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] 2210; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero 2211; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 2212; AVX512DQ-NEXT: vporq %zmm6, %zmm7, %zmm6 2213; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] 2214; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] 2215; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] 2216; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] 2217; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7) 2218; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] 2219; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] 2220; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 2221; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 2222; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] 2223; AVX512DQ-NEXT: vpandn %ymm8, %ymm9, %ymm8 2224; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] 2225; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] 2226; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] 2227; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 2228; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 2229; AVX512DQ-NEXT: vporq %zmm8, %zmm7, %zmm7 2230; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) 2231; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] 2232; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] 2233; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] 2234; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] 2235; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6 2236; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] 2237; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] 
2238; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 2239; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] 2240; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] 2241; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) 2242; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) 2243; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] 2244; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] 2245; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 2246; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] 2247; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 2248; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2249; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero 2250; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] 2251; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 2252; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) 2253; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 2254; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) 2255; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) 2256; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) 2257; AVX512DQ-NEXT: vzeroupper 2258; AVX512DQ-NEXT: retq 2259; 2260; AVX512DQ-FCP-LABEL: store_i8_stride7_vf16: 2261; AVX512DQ-FCP: # %bb.0: 2262; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2263; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2264; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 2265; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 2266; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 2267; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 2268; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 2269; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 2270; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 2271; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 2272; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] 2273; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] 2274; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6] 2275; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] 2276; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8 2277; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero 2278; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 2279; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] 2280; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero 2281; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9 2282; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25] 2283; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 2284; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6 2285; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] 2286; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = 
[0,1,0,1,0,0,0,0] 2287; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 2288; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 2289; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] 2290; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 2291; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2] 2292; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] 2293; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7 2294; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u] 2295; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 2296; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem) 2297; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) 2298; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] 2299; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] 2300; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] 2301; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] 2302; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 2303; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] 2304; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] 2305; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] 2306; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 2307; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] 2308; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] 2309; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) 2310; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) 2311; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] 2312; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] 2313; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 2314; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] 2315; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 2316; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2317; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero 2318; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] 2319; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 2320; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) 2321; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 2322; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) 2323; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 2324; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) 2325; AVX512DQ-FCP-NEXT: vzeroupper 2326; AVX512DQ-FCP-NEXT: retq 2327; 2328; AVX512BW-LABEL: store_i8_stride7_vf16: 2329; AVX512BW: # %bb.0: 2330; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2331; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 2332; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 2333; 
AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 2334; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 2335; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 2336; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 2337; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2338; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 2339; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 2340; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 2341; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] 2342; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 2343; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7] 2344; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] 2345; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 2346; AVX512BW-NEXT: kmovd %ecx, %k1 2347; AVX512BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} 2348; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] 2349; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero 2350; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] 2351; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero 2352; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 2353; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 2354; AVX512BW-NEXT: kmovd %ecx, %k1 2355; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} 2356; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 2357; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] 2358; AVX512BW-NEXT: vpermw %ymm4, %ymm3, %ymm3 2359; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] 2360; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 2361; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 2362; AVX512BW-NEXT: kmovd %ecx, %k1 2363; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} 2364; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1] 2365; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero 2366; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] 2367; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero 2368; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3 2369; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 2370; AVX512BW-NEXT: kmovd %ecx, %k1 2371; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} 2372; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 2373; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] 2374; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6 2375; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero 2376; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] 2377; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero 2378; AVX512BW-NEXT: vpor %ymm4, %ymm7, %ymm4 2379; 
AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] 2380; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] 2381; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2382; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 2383; AVX512BW-NEXT: kmovq %rcx, %k1 2384; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} 2385; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero 2386; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] 2387; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero 2388; AVX512BW-NEXT: vpor %ymm4, %ymm6, %ymm4 2389; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero 2390; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] 2391; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] 2392; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 2393; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 2394; AVX512BW-NEXT: kmovd %ecx, %k1 2395; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} 2396; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] 2397; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero 2398; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] 2399; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] 2400; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 2401; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 2402; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 2403; AVX512BW-NEXT: kmovq %rcx, %k1 2404; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} 2405; AVX512BW-NEXT: vmovdqa %xmm5, 96(%rax) 2406; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) 2407; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) 2408; AVX512BW-NEXT: vzeroupper 2409; AVX512BW-NEXT: retq 2410; 2411; AVX512BW-FCP-LABEL: store_i8_stride7_vf16: 2412; AVX512BW-FCP: # %bb.0: 2413; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2414; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2415; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 2416; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 2417; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 2418; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 2419; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 2420; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2421; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 2422; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] 2423; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero 2424; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] 2425; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero 2426; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 2427; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 2428; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] 2429; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero 2430; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5 2431; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15] 2432; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 2433; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 2434; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 2435; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} 2436; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 2437; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] 2438; AVX512BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4 2439; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] 2440; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 2441; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 2442; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 2443; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} 2444; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1] 2445; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero 2446; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] 2447; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero 2448; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 2449; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 2450; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 2451; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1} 2452; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 2453; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] 2454; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6 2455; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] 2456; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] 2457; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7 2458; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] 2459; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 2460; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] 2461; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 2462; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 2463; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} 2464; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6 2465; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] 2466; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 2467; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57] 2468; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 2469; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] 2470; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 2471; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero 2472; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0 2473; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 2474; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 2475; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} 2476; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax) 2477; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 2478; AVX512BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) 2479; AVX512BW-FCP-NEXT: vzeroupper 2480; AVX512BW-FCP-NEXT: retq 2481; 2482; AVX512DQ-BW-LABEL: store_i8_stride7_vf16: 2483; AVX512DQ-BW: # %bb.0: 2484; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2485; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 2486; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 2487; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 2488; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 2489; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 2490; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 2491; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2492; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 2493; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 2494; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 2495; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] 2496; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 2497; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7] 2498; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] 2499; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 2500; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 2501; AVX512DQ-BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} 2502; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] 2503; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero 2504; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] 2505; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero 2506; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 2507; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 2508; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 2509; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} 2510; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 2511; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] 2512; AVX512DQ-BW-NEXT: vpermw %ymm4, %ymm3, %ymm3 2513; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] 2514; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 2515; AVX512DQ-BW-NEXT: 
movl $67637280, %ecx # imm = 0x4081020 2516; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 2517; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} 2518; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1] 2519; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero 2520; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] 2521; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero 2522; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3 2523; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 2524; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 2525; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} 2526; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 2527; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] 2528; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6 2529; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero 2530; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] 2531; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero 2532; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm7, %ymm4 2533; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] 2534; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] 2535; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 2536; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 2537; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 2538; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} 2539; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero 2540; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] 2541; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero 2542; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm6, %ymm4 2543; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero 2544; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] 2545; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] 2546; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 2547; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 2548; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 2549; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} 2550; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] 2551; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero 2552; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] 2553; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] 2554; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 2555; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 2556; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 2557; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 2558; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} 2559; AVX512DQ-BW-NEXT: vmovdqa %xmm5, 96(%rax) 2560; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) 2561; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) 2562; AVX512DQ-BW-NEXT: vzeroupper 2563; AVX512DQ-BW-NEXT: retq 2564; 2565; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf16: 2566; AVX512DQ-BW-FCP: # %bb.0: 2567; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2568; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2569; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 2570; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 2571; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 2572; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 2573; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 2574; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 2575; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 2576; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] 2577; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero 2578; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] 2579; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero 2580; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 2581; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 2582; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] 2583; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero 2584; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5 2585; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15] 2586; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 2587; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 2588; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 2589; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} 2590; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 2591; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] 2592; AVX512DQ-BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4 2593; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] 2594; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] 2595; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 2596; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 2597; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} 2598; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1] 2599; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero 2600; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] 2601; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero 2602; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 2603; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 2604; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 2605; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1} 2606; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 2607; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] 2608; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6 2609; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] 2610; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] 2611; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7 2612; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] 2613; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 2614; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] 2615; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 2616; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 2617; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} 2618; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6 2619; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] 2620; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 2621; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57] 2622; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 2623; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] 2624; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 2625; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero 2626; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0 2627; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 2628; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 2629; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} 2630; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax) 2631; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 2632; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) 2633; AVX512DQ-BW-FCP-NEXT: vzeroupper 2634; AVX512DQ-BW-FCP-NEXT: retq 2635 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 2636 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64 2637 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64 2638 %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64 2639 %in.vec4 = load <16 x i8>, ptr %in.vecptr4, align 64 2640 
%in.vec5 = load <16 x i8>, ptr %in.vecptr5, align 64 2641 %in.vec6 = load <16 x i8>, ptr %in.vecptr6, align 64 2642 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2643 %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2644 %3 = shufflevector <16 x i8> %in.vec4, <16 x i8> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2645 %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 2646 %5 = shufflevector <16 x i8> %in.vec6, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2647 %6 = shufflevector <32 x i8> %3, <32 x i8> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 2648 %7 = shufflevector <48 x i8> %6, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2649 %8 = shufflevector <64 x i8> %4, <64 x i8> %7, <112 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, 
i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111> 2650 %interleaved.vec = shufflevector <112 x i8> %8, <112 x i8> poison, <112 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111> 2651 store <112 x i8> %interleaved.vec, ptr %out.vec, align 64 2652 ret void 2653} 2654 2655define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { 2656; SSE-LABEL: store_i8_stride7_vf32: 2657; SSE: # %bb.0: 2658; SSE-NEXT: subq $360, %rsp # imm = 0x168 2659; SSE-NEXT: movdqa 16(%rdi), %xmm15 2660; SSE-NEXT: movdqa 16(%rsi), %xmm4 2661; SSE-NEXT: movdqa 16(%rdx), %xmm3 2662; SSE-NEXT: movdqa 16(%rcx), %xmm7 2663; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2664; SSE-NEXT: movdqa 16(%r8), %xmm6 2665; SSE-NEXT: movdqa 16(%r9), %xmm5 2666; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2667; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6] 2668; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 2669; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 2670; SSE-NEXT: pand %xmm10, %xmm0 2671; SSE-NEXT: movdqa %xmm4, %xmm8 2672; SSE-NEXT: movdqa %xmm4, %xmm13 2673; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2674; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 2675; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] 2676; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2677; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2678; SSE-NEXT: movdqa %xmm10, %xmm2 2679; SSE-NEXT: pandn %xmm1, %xmm2 2680; SSE-NEXT: por %xmm0, %xmm2 2681; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 2682; SSE-NEXT: movdqa %xmm1, %xmm0 2683; SSE-NEXT: movdqa %xmm1, %xmm11 2684; SSE-NEXT: pandn %xmm2, %xmm0 2685; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] 2686; SSE-NEXT: movdqa %xmm3, %xmm4 2687; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2688; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2689; SSE-NEXT: movdqa {{.*#+}} xmm9 = 
[255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] 2690; SSE-NEXT: movdqa %xmm9, %xmm3 2691; SSE-NEXT: pandn %xmm1, %xmm3 2692; SSE-NEXT: movdqa %xmm7, %xmm2 2693; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 2694; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] 2695; SSE-NEXT: movdqa %xmm2, %xmm7 2696; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2697; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 2698; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 2699; SSE-NEXT: pand %xmm9, %xmm1 2700; SSE-NEXT: por %xmm3, %xmm1 2701; SSE-NEXT: pand %xmm11, %xmm1 2702; SSE-NEXT: por %xmm0, %xmm1 2703; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 2704; SSE-NEXT: pand %xmm11, %xmm1 2705; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] 2706; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 2707; SSE-NEXT: movdqa %xmm11, %xmm3 2708; SSE-NEXT: pandn %xmm0, %xmm3 2709; SSE-NEXT: por %xmm1, %xmm3 2710; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 2711; SSE-NEXT: movdqa %xmm5, %xmm1 2712; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 2713; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 2714; SSE-NEXT: movdqa %xmm1, %xmm5 2715; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2716; SSE-NEXT: movdqa %xmm12, %xmm1 2717; SSE-NEXT: pandn %xmm0, %xmm1 2718; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2719; SSE-NEXT: pand %xmm12, %xmm3 2720; SSE-NEXT: por %xmm3, %xmm1 2721; SSE-NEXT: movdqa 16(%rax), %xmm14 2722; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] 2723; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 2724; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 2725; SSE-NEXT: movdqa %xmm3, %xmm2 2726; SSE-NEXT: pandn %xmm0, %xmm2 2727; SSE-NEXT: pand %xmm3, %xmm1 2728; SSE-NEXT: por %xmm1, %xmm2 2729; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2730; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7] 2731; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2732; SSE-NEXT: movdqa %xmm10, %xmm1 2733; SSE-NEXT: pandn %xmm0, %xmm1 2734; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] 2735; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 2736; SSE-NEXT: pand %xmm10, %xmm0 2737; SSE-NEXT: por %xmm0, %xmm1 2738; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 2739; SSE-NEXT: movdqa %xmm2, %xmm0 2740; SSE-NEXT: pandn %xmm1, %xmm0 2741; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] 2742; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 2743; SSE-NEXT: movdqa %xmm3, %xmm4 2744; SSE-NEXT: pandn %xmm1, %xmm3 2745; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2746; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] 2747; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2748; SSE-NEXT: pand %xmm4, %xmm1 2749; SSE-NEXT: por %xmm1, %xmm3 2750; SSE-NEXT: pand %xmm2, %xmm3 2751; SSE-NEXT: por %xmm0, %xmm3 2752; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] 2753; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2754; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 2755; SSE-NEXT: movdqa %xmm9, %xmm1 2756; SSE-NEXT: pandn %xmm0, 
%xmm1 2757; SSE-NEXT: pand %xmm9, %xmm3 2758; SSE-NEXT: por %xmm3, %xmm1 2759; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] 2760; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,3] 2761; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] 2762; SSE-NEXT: movdqa %xmm0, %xmm4 2763; SSE-NEXT: pandn %xmm3, %xmm4 2764; SSE-NEXT: pand %xmm0, %xmm1 2765; SSE-NEXT: por %xmm1, %xmm4 2766; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2767; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,7,7,7] 2768; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] 2769; SSE-NEXT: movdqa %xmm11, %xmm2 2770; SSE-NEXT: pandn %xmm1, %xmm2 2771; SSE-NEXT: pand %xmm11, %xmm4 2772; SSE-NEXT: por %xmm4, %xmm2 2773; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2774; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] 2775; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2776; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] 2777; SSE-NEXT: movdqa %xmm12, %xmm4 2778; SSE-NEXT: pandn %xmm1, %xmm4 2779; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] 2780; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 2781; SSE-NEXT: pand %xmm12, %xmm1 2782; SSE-NEXT: por %xmm1, %xmm4 2783; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2784; SSE-NEXT: # xmm1 = mem[2,1,2,3] 2785; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2786; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] 2787; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] 2788; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 2789; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 2790; SSE-NEXT: movdqa %xmm15, %xmm7 2791; SSE-NEXT: pandn %xmm1, %xmm7 2792; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2793; SSE-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] 2794; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 2795; SSE-NEXT: pand %xmm15, %xmm1 2796; SSE-NEXT: por %xmm1, %xmm7 2797; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 2798; SSE-NEXT: movdqa %xmm1, %xmm3 2799; SSE-NEXT: pandn %xmm7, %xmm3 2800; SSE-NEXT: pand %xmm1, %xmm4 2801; SSE-NEXT: por %xmm4, %xmm3 2802; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 2803; SSE-NEXT: # xmm4 = mem[2,1,2,3] 2804; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2805; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] 2806; SSE-NEXT: movdqa %xmm9, %xmm7 2807; SSE-NEXT: pandn %xmm4, %xmm7 2808; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] 2809; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2810; SSE-NEXT: pand %xmm9, %xmm4 2811; SSE-NEXT: por %xmm4, %xmm7 2812; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] 2813; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2814; SSE-NEXT: movdqa %xmm0, %xmm8 2815; SSE-NEXT: pandn %xmm4, %xmm8 2816; SSE-NEXT: pand %xmm0, %xmm7 2817; SSE-NEXT: por %xmm7, %xmm8 2818; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 2819; SSE-NEXT: movdqa %xmm5, %xmm2 2820; SSE-NEXT: pandn %xmm8, %xmm2 2821; SSE-NEXT: pand %xmm5, %xmm3 2822; SSE-NEXT: por %xmm3, %xmm2 2823; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2824; SSE-NEXT: movdqa (%rsi), %xmm6 2825; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,1,2,3] 2826; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 2827; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2828; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3] 2829; SSE-NEXT: movdqa %xmm12, %xmm3 2830; SSE-NEXT: pandn %xmm4, %xmm3 2831; SSE-NEXT: movdqa (%rdi), %xmm13 2832; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] 2833; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2834; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2835; SSE-NEXT: pand %xmm12, %xmm4 2836; SSE-NEXT: por %xmm4, %xmm3 2837; SSE-NEXT: movdqa (%rcx), %xmm14 2838; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3] 2839; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2840; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2841; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] 2842; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] 2843; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2844; SSE-NEXT: movdqa %xmm15, %xmm7 2845; SSE-NEXT: pandn %xmm4, %xmm7 2846; SSE-NEXT: movdqa (%rdx), %xmm8 2847; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] 2848; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill 2849; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] 2850; SSE-NEXT: pand %xmm15, %xmm4 2851; SSE-NEXT: por %xmm4, %xmm7 2852; SSE-NEXT: pand %xmm1, %xmm3 2853; SSE-NEXT: pandn %xmm7, %xmm1 2854; SSE-NEXT: por %xmm3, %xmm1 2855; SSE-NEXT: movdqa (%r9), %xmm11 2856; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] 2857; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2858; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2859; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] 2860; SSE-NEXT: movdqa %xmm9, %xmm4 2861; SSE-NEXT: pandn %xmm3, %xmm4 2862; SSE-NEXT: movdqa (%r8), %xmm7 2863; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] 2864; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2865; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 2866; SSE-NEXT: pand %xmm9, %xmm3 2867; SSE-NEXT: por %xmm3, %xmm4 2868; SSE-NEXT: pand %xmm0, %xmm4 2869; SSE-NEXT: movdqa (%rax), %xmm10 2870; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] 2871; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2872; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 2873; SSE-NEXT: pandn %xmm3, %xmm0 2874; SSE-NEXT: por %xmm4, %xmm0 2875; SSE-NEXT: pand %xmm5, %xmm1 2876; SSE-NEXT: pandn %xmm0, %xmm5 2877; SSE-NEXT: por %xmm1, %xmm5 2878; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2879; SSE-NEXT: movdqa %xmm6, %xmm0 2880; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 2881; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2882; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] 2883; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2884; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 2885; SSE-NEXT: movdqa %xmm2, %xmm1 2886; SSE-NEXT: pandn %xmm0, %xmm1 2887; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] 2888; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 2889; SSE-NEXT: pand %xmm2, %xmm0 2890; SSE-NEXT: por %xmm0, %xmm1 2891; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 2892; SSE-NEXT: movdqa %xmm2, %xmm3 2893; SSE-NEXT: pandn %xmm1, %xmm3 2894; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm8[0,1,2,3,6,6,6,6] 2895; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 2896; SSE-NEXT: movdqa %xmm9, %xmm1 2897; SSE-NEXT: pandn %xmm0, %xmm1 2898; SSE-NEXT: movdqa %xmm14, %xmm0 2899; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] 2900; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2901; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 2902; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] 2903; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 2904; SSE-NEXT: pand %xmm9, %xmm0 2905; SSE-NEXT: por %xmm1, %xmm0 2906; SSE-NEXT: pand %xmm2, %xmm0 2907; SSE-NEXT: por %xmm3, %xmm0 2908; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2909; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2910; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] 2911; SSE-NEXT: movdqa %xmm12, %xmm3 2912; SSE-NEXT: pandn %xmm1, %xmm3 2913; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7] 2914; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] 2915; SSE-NEXT: pand %xmm12, %xmm1 2916; SSE-NEXT: por %xmm3, %xmm1 2917; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] 2918; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 2919; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 2920; SSE-NEXT: movdqa %xmm2, %xmm4 2921; SSE-NEXT: pandn %xmm3, %xmm4 2922; SSE-NEXT: pand %xmm2, %xmm1 2923; SSE-NEXT: por %xmm1, %xmm4 2924; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 2925; SSE-NEXT: pand %xmm1, %xmm0 2926; SSE-NEXT: pandn %xmm4, %xmm1 2927; SSE-NEXT: por %xmm0, %xmm1 2928; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2929; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 2930; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2931; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] 2932; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2933; SSE-NEXT: movdqa %xmm9, %xmm1 2934; SSE-NEXT: pandn %xmm0, %xmm1 2935; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2936; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] 2937; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2938; SSE-NEXT: pand %xmm9, %xmm0 2939; SSE-NEXT: por %xmm1, %xmm0 2940; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 2941; SSE-NEXT: movdqa %xmm13, %xmm1 2942; SSE-NEXT: pandn %xmm0, %xmm1 2943; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2944; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2945; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] 2946; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2947; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 2948; SSE-NEXT: movdqa %xmm15, %xmm3 2949; SSE-NEXT: pandn %xmm0, %xmm3 2950; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2951; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] 2952; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 2953; SSE-NEXT: pand %xmm15, %xmm0 2954; SSE-NEXT: por %xmm0, %xmm3 2955; SSE-NEXT: pand %xmm13, %xmm3 2956; SSE-NEXT: por %xmm1, %xmm3 2957; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 2958; SSE-NEXT: pandn %xmm3, %xmm0 2959; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 
2960; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2961; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,5,7] 2962; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2963; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] 2964; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 2965; SSE-NEXT: movdqa %xmm12, %xmm3 2966; SSE-NEXT: pandn %xmm1, %xmm3 2967; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2968; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,3,4,5,6,7] 2969; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 2970; SSE-NEXT: pand %xmm12, %xmm1 2971; SSE-NEXT: por %xmm1, %xmm3 2972; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2973; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] 2974; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 2975; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 2976; SSE-NEXT: movdqa %xmm2, %xmm14 2977; SSE-NEXT: pandn %xmm1, %xmm14 2978; SSE-NEXT: pand %xmm2, %xmm3 2979; SSE-NEXT: por %xmm3, %xmm14 2980; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14 2981; SSE-NEXT: por %xmm0, %xmm14 2982; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2983; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] 2984; SSE-NEXT: movdqa %xmm9, %xmm1 2985; SSE-NEXT: pandn %xmm0, %xmm1 2986; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] 2987; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 2988; SSE-NEXT: pand %xmm9, %xmm0 2989; SSE-NEXT: por %xmm1, %xmm0 2990; SSE-NEXT: movdqa %xmm13, %xmm1 2991; SSE-NEXT: pandn %xmm0, %xmm1 2992; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] 2993; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,4] 2994; SSE-NEXT: movdqa %xmm12, %xmm0 2995; SSE-NEXT: pandn %xmm3, %xmm0 2996; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[1,1,2,2,4,5,6,7] 2997; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] 2998; SSE-NEXT: pand %xmm12, %xmm3 2999; SSE-NEXT: movdqa %xmm12, %xmm11 3000; SSE-NEXT: por %xmm3, %xmm0 3001; SSE-NEXT: pand %xmm13, %xmm0 3002; SSE-NEXT: por %xmm1, %xmm0 3003; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] 3004; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 3005; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 3006; SSE-NEXT: movdqa %xmm13, %xmm3 3007; SSE-NEXT: pandn %xmm1, %xmm3 3008; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] 3009; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 3010; SSE-NEXT: pand %xmm13, %xmm1 3011; SSE-NEXT: por %xmm1, %xmm3 3012; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] 3013; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 3014; SSE-NEXT: movdqa %xmm15, %xmm4 3015; SSE-NEXT: pandn %xmm1, %xmm4 3016; SSE-NEXT: pand %xmm15, %xmm3 3017; SSE-NEXT: por %xmm3, %xmm4 3018; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 3019; SSE-NEXT: movdqa %xmm2, %xmm1 3020; SSE-NEXT: pandn %xmm4, %xmm1 3021; SSE-NEXT: pand %xmm2, %xmm0 3022; SSE-NEXT: por %xmm0, %xmm1 3023; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3024; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3025; SSE-NEXT: # xmm0 = mem[0,1,1,3] 3026; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 3027; SSE-NEXT: movdqa %xmm12, %xmm1 3028; SSE-NEXT: pandn %xmm0, %xmm1 3029; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload 3030; SSE-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] 3031; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 3032; SSE-NEXT: pand %xmm12, %xmm0 3033; SSE-NEXT: por %xmm1, %xmm0 3034; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 3035; SSE-NEXT: movdqa %xmm6, %xmm3 3036; SSE-NEXT: pandn %xmm0, %xmm3 3037; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3038; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] 3039; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 3040; SSE-NEXT: movdqa %xmm9, %xmm4 3041; SSE-NEXT: pandn %xmm0, %xmm4 3042; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3043; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] 3044; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 3045; SSE-NEXT: pand %xmm9, %xmm0 3046; SSE-NEXT: por %xmm4, %xmm0 3047; SSE-NEXT: pand %xmm6, %xmm0 3048; SSE-NEXT: por %xmm3, %xmm0 3049; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3050; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] 3051; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 3052; SSE-NEXT: movdqa %xmm15, %xmm4 3053; SSE-NEXT: pandn %xmm3, %xmm4 3054; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3055; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,6,5] 3056; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 3057; SSE-NEXT: pand %xmm15, %xmm3 3058; SSE-NEXT: por %xmm3, %xmm4 3059; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3060; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] 3061; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 3062; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 3063; SSE-NEXT: movdqa %xmm1, %xmm10 3064; SSE-NEXT: pandn %xmm3, %xmm10 3065; SSE-NEXT: pand %xmm1, %xmm4 3066; SSE-NEXT: por %xmm4, %xmm10 3067; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 3068; SSE-NEXT: movdqa %xmm2, %xmm1 3069; SSE-NEXT: pandn %xmm10, %xmm1 3070; SSE-NEXT: pand %xmm2, %xmm0 3071; SSE-NEXT: por %xmm0, %xmm1 3072; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3073; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3074; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3075; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] 3076; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3077; SSE-NEXT: movdqa %xmm9, %xmm3 3078; SSE-NEXT: pandn %xmm0, %xmm3 3079; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] 3080; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 3081; SSE-NEXT: pand %xmm9, %xmm0 3082; SSE-NEXT: por %xmm3, %xmm0 3083; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 3084; SSE-NEXT: movdqa %xmm5, %xmm3 3085; SSE-NEXT: pandn %xmm0, %xmm3 3086; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3087; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3088; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1] 3089; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3090; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,5,6,4] 3091; SSE-NEXT: movdqa %xmm11, %xmm0 3092; SSE-NEXT: pandn %xmm4, %xmm0 3093; SSE-NEXT: movdqa %xmm8, %xmm1 3094; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,1,2,2,4,5,6,7] 3095; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] 3096; SSE-NEXT: pand %xmm11, %xmm4 3097; SSE-NEXT: por %xmm4, %xmm0 3098; SSE-NEXT: pand %xmm5, %xmm0 3099; SSE-NEXT: por %xmm3, %xmm0 3100; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3101; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3102; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] 3103; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3104; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] 3105; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 3106; SSE-NEXT: movdqa %xmm2, %xmm4 3107; SSE-NEXT: pandn %xmm3, %xmm4 3108; SSE-NEXT: movdqa %xmm13, %xmm5 3109; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] 3110; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 3111; SSE-NEXT: pand %xmm2, %xmm3 3112; SSE-NEXT: por %xmm3, %xmm4 3113; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] 3114; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 3115; SSE-NEXT: movdqa %xmm15, %xmm10 3116; SSE-NEXT: pandn %xmm3, %xmm10 3117; SSE-NEXT: pand %xmm15, %xmm4 3118; SSE-NEXT: por %xmm4, %xmm10 3119; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 3120; SSE-NEXT: pand %xmm3, %xmm0 3121; SSE-NEXT: pandn %xmm10, %xmm3 3122; SSE-NEXT: por %xmm0, %xmm3 3123; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3124; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] 3125; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 3126; SSE-NEXT: movdqa %xmm2, %xmm3 3127; SSE-NEXT: pandn %xmm0, %xmm3 3128; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] 3129; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3130; SSE-NEXT: pand %xmm2, %xmm0 3131; SSE-NEXT: por %xmm0, %xmm3 3132; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 3133; SSE-NEXT: movdqa %xmm0, %xmm4 3134; SSE-NEXT: pandn %xmm3, %xmm4 3135; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,1,3,4,5,6,7] 3136; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,1,0] 3137; SSE-NEXT: movdqa %xmm11, %xmm3 3138; SSE-NEXT: pandn %xmm10, %xmm3 3139; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 3140; SSE-NEXT: # xmm10 = mem[0,0,2,1,4,5,6,7] 3141; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 3142; SSE-NEXT: pand %xmm11, %xmm10 3143; SSE-NEXT: por %xmm10, %xmm3 3144; SSE-NEXT: pand %xmm0, %xmm3 3145; SSE-NEXT: por %xmm4, %xmm3 3146; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7] 3147; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] 3148; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 3149; SSE-NEXT: movdqa %xmm13, %xmm10 3150; SSE-NEXT: pandn %xmm4, %xmm10 3151; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7] 3152; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 3153; SSE-NEXT: pand %xmm13, %xmm4 3154; SSE-NEXT: por %xmm4, %xmm10 3155; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7] 3156; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 3157; SSE-NEXT: movdqa %xmm9, %xmm2 3158; SSE-NEXT: pandn %xmm4, %xmm2 3159; SSE-NEXT: pand %xmm9, %xmm10 3160; SSE-NEXT: por %xmm10, %xmm2 3161; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 3162; SSE-NEXT: movdqa %xmm7, %xmm1 3163; SSE-NEXT: pandn %xmm2, %xmm1 3164; SSE-NEXT: pand %xmm7, %xmm3 3165; SSE-NEXT: por %xmm3, %xmm1 3166; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3167; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3168; SSE-NEXT: # xmm2 = mem[0,1,1,3] 3169; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 3170; SSE-NEXT: movdqa %xmm1, %xmm3 3171; SSE-NEXT: pandn %xmm2, %xmm3 3172; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3173; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] 3174; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 3175; SSE-NEXT: pand %xmm1, %xmm2 3176; SSE-NEXT: por %xmm3, %xmm2 3177; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3178; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] 3179; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] 3180; SSE-NEXT: movdqa %xmm9, %xmm5 3181; SSE-NEXT: pandn %xmm3, %xmm5 3182; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3183; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] 3184; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] 3185; SSE-NEXT: pand %xmm9, %xmm3 3186; SSE-NEXT: por %xmm5, %xmm3 3187; SSE-NEXT: pand %xmm6, %xmm3 3188; SSE-NEXT: pandn %xmm2, %xmm6 3189; SSE-NEXT: por %xmm3, %xmm6 3190; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3191; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] 3192; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 3193; SSE-NEXT: movdqa %xmm15, %xmm3 3194; SSE-NEXT: pandn %xmm2, %xmm3 3195; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3196; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] 3197; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] 3198; SSE-NEXT: pand %xmm15, %xmm2 3199; SSE-NEXT: por %xmm2, %xmm3 3200; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3201; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7] 3202; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 3203; SSE-NEXT: movdqa %xmm13, %xmm5 3204; SSE-NEXT: pandn %xmm2, %xmm5 3205; SSE-NEXT: pand %xmm13, %xmm3 3206; SSE-NEXT: por %xmm3, %xmm5 3207; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 3208; SSE-NEXT: pand %xmm14, %xmm6 3209; SSE-NEXT: pandn %xmm5, %xmm14 3210; SSE-NEXT: por %xmm6, %xmm14 3211; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3212; SSE-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] 3213; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 3214; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 3215; SSE-NEXT: movdqa %xmm3, %xmm2 3216; SSE-NEXT: pandn %xmm1, %xmm2 3217; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] 3218; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 3219; SSE-NEXT: pand %xmm3, %xmm1 3220; SSE-NEXT: movdqa %xmm3, %xmm4 3221; SSE-NEXT: por %xmm1, %xmm2 3222; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3223; SSE-NEXT: # xmm1 = mem[0,2,1,3,4,5,6,7] 3224; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] 3225; SSE-NEXT: movdqa %xmm11, %xmm3 3226; SSE-NEXT: pandn %xmm1, %xmm3 3227; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] 3228; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 3229; SSE-NEXT: pand %xmm11, %xmm1 3230; SSE-NEXT: por %xmm1, %xmm3 3231; SSE-NEXT: pand %xmm0, %xmm3 3232; SSE-NEXT: pandn %xmm2, %xmm0 3233; SSE-NEXT: por %xmm3, %xmm0 3234; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3235; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] 3236; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 3237; SSE-NEXT: movdqa %xmm13, %xmm2 3238; SSE-NEXT: pandn %xmm1, %xmm2 3239; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] 3240; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 3241; SSE-NEXT: pand 
%xmm13, %xmm1 3242; SSE-NEXT: por %xmm1, %xmm2 3243; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7] 3244; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 3245; SSE-NEXT: movdqa %xmm9, %xmm3 3246; SSE-NEXT: pandn %xmm1, %xmm3 3247; SSE-NEXT: pand %xmm9, %xmm2 3248; SSE-NEXT: por %xmm2, %xmm3 3249; SSE-NEXT: pand %xmm7, %xmm0 3250; SSE-NEXT: pandn %xmm3, %xmm7 3251; SSE-NEXT: por %xmm0, %xmm7 3252; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload 3253; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] 3254; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 3255; SSE-NEXT: pand %xmm13, %xmm0 3256; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3257; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7] 3258; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3259; SSE-NEXT: pandn %xmm1, %xmm13 3260; SSE-NEXT: por %xmm0, %xmm13 3261; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3262; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] 3263; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 3264; SSE-NEXT: movdqa %xmm4, %xmm3 3265; SSE-NEXT: pand %xmm4, %xmm0 3266; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3267; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,6,6,7] 3268; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 3269; SSE-NEXT: pandn %xmm1, %xmm3 3270; SSE-NEXT: por %xmm0, %xmm3 3271; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 3272; SSE-NEXT: pand %xmm0, %xmm3 3273; SSE-NEXT: pandn %xmm13, %xmm0 3274; SSE-NEXT: por %xmm3, %xmm0 3275; SSE-NEXT: movdqa %xmm0, %xmm3 3276; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3277; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] 3278; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 3279; SSE-NEXT: movdqa %xmm9, %xmm1 3280; SSE-NEXT: pandn %xmm0, %xmm1 3281; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3282; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] 3283; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] 3284; SSE-NEXT: pand %xmm9, %xmm0 3285; SSE-NEXT: por %xmm1, %xmm0 3286; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3287; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,7,7,7] 3288; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] 3289; SSE-NEXT: movdqa %xmm11, %xmm2 3290; SSE-NEXT: pandn %xmm1, %xmm2 3291; SSE-NEXT: pand %xmm11, %xmm0 3292; SSE-NEXT: por %xmm0, %xmm2 3293; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 3294; SSE-NEXT: pand %xmm0, %xmm3 3295; SSE-NEXT: pandn %xmm2, %xmm0 3296; SSE-NEXT: por %xmm3, %xmm0 3297; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3298; SSE-NEXT: # xmm1 = mem[2,2,3,3] 3299; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] 3300; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 3301; SSE-NEXT: pand %xmm9, %xmm2 3302; SSE-NEXT: pandn %xmm1, %xmm9 3303; SSE-NEXT: por %xmm2, %xmm9 3304; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] 3305; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 3306; SSE-NEXT: pand %xmm15, %xmm1 3307; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3308; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] 3309; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] 3310; SSE-NEXT: pandn %xmm2, %xmm15 3311; SSE-NEXT: por %xmm1, %xmm15 3312; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 3313; SSE-NEXT: pand %xmm1, %xmm15 3314; SSE-NEXT: pandn %xmm9, %xmm1 
3315; SSE-NEXT: por %xmm15, %xmm1 3316; SSE-NEXT: movdqa %xmm1, %xmm3 3317; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,3,4,5,6,7] 3318; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 3319; SSE-NEXT: pand %xmm11, %xmm1 3320; SSE-NEXT: pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3321; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,5,7] 3322; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 3323; SSE-NEXT: pandn %xmm2, %xmm11 3324; SSE-NEXT: por %xmm1, %xmm11 3325; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 3326; SSE-NEXT: pand %xmm4, %xmm11 3327; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] 3328; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 3329; SSE-NEXT: pandn %xmm1, %xmm4 3330; SSE-NEXT: por %xmm11, %xmm4 3331; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 3332; SSE-NEXT: pand %xmm1, %xmm4 3333; SSE-NEXT: pandn %xmm3, %xmm1 3334; SSE-NEXT: por %xmm1, %xmm4 3335; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 3336; SSE-NEXT: movdqa %xmm4, 32(%rax) 3337; SSE-NEXT: movdqa %xmm0, 96(%rax) 3338; SSE-NEXT: movdqa %xmm7, 112(%rax) 3339; SSE-NEXT: movdqa %xmm14, 176(%rax) 3340; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3341; SSE-NEXT: movaps %xmm0, (%rax) 3342; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3343; SSE-NEXT: movaps %xmm0, 16(%rax) 3344; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3345; SSE-NEXT: movaps %xmm0, 64(%rax) 3346; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3347; SSE-NEXT: movaps %xmm0, 128(%rax) 3348; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3349; SSE-NEXT: movaps %xmm0, 144(%rax) 3350; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3351; SSE-NEXT: movaps %xmm0, 80(%rax) 3352; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3353; SSE-NEXT: movaps %xmm0, 48(%rax) 3354; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3355; SSE-NEXT: movaps %xmm0, 160(%rax) 3356; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3357; SSE-NEXT: movaps %xmm0, 208(%rax) 3358; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3359; SSE-NEXT: movaps %xmm0, 192(%rax) 3360; SSE-NEXT: addq $360, %rsp # imm = 0x168 3361; SSE-NEXT: retq 3362; 3363; AVX-LABEL: store_i8_stride7_vf32: 3364; AVX: # %bb.0: 3365; AVX-NEXT: subq $216, %rsp 3366; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 3367; AVX-NEXT: vmovdqa 16(%rax), %xmm14 3368; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u] 3369; AVX-NEXT: vmovdqa 16(%r9), %xmm2 3370; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3371; AVX-NEXT: vmovdqa 16(%r8), %xmm3 3372; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3373; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 3374; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3375; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] 3376; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 3377; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u] 3378; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm1 3379; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u] 3380; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 3381; AVX-NEXT: vpor %xmm1, 
%xmm3, %xmm1 3382; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u] 3383; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 3384; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u] 3385; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm3 3386; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 3387; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 3388; AVX-NEXT: vmovdqa 16(%rcx), %xmm1 3389; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3390; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] 3391; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm0 3392; AVX-NEXT: vmovdqa 16(%rdx), %xmm2 3393; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3394; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u] 3395; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm3 3396; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 3397; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3398; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3399; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] 3400; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15 3401; AVX-NEXT: vmovdqa 16(%rsi), %xmm10 3402; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128] 3403; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm0 3404; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 3405; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] 3406; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 3407; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] 3408; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] 3409; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3410; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 3411; AVX-NEXT: vandnps %ymm15, %ymm2, %ymm15 3412; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 3413; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 3414; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 3415; AVX-NEXT: vandnps %ymm7, %ymm2, %ymm7 3416; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 3417; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0 3418; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3419; AVX-NEXT: vmovdqa (%r9), %xmm7 3420; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm0 3421; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3422; AVX-NEXT: vmovdqa (%r8), %xmm3 3423; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3424; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm2 3425; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 3426; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 3427; AVX-NEXT: vmovdqa (%rax), %xmm8 3428; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3429; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm2 3430; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 3431; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128] 3432; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm2 3433; AVX-NEXT: vmovdqa %xmm5, %xmm8 3434; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] 3435; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3436; 
AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] 3437; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 3438; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15 3439; AVX-NEXT: vmovdqa (%rcx), %xmm3 3440; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3441; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm0 3442; AVX-NEXT: vmovdqa (%rdx), %xmm13 3443; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm2 3444; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 3445; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] 3446; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3447; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] 3448; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 3449; AVX-NEXT: vmovdqa (%rsi), %xmm5 3450; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm2 3451; AVX-NEXT: vmovdqa (%rdi), %xmm3 3452; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] 3453; AVX-NEXT: vpor %xmm2, %xmm11, %xmm2 3454; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 3455; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3456; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] 3457; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm11 3458; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11 3459; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 3460; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 3461; AVX-NEXT: vandps %ymm4, %ymm11, %ymm11 3462; AVX-NEXT: vorps %ymm0, %ymm11, %ymm0 3463; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 3464; AVX-NEXT: vandnps %ymm15, %ymm11, %ymm15 3465; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 3466; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 3467; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3468; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3469; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm0 3470; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3471; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3472; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 3473; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 3474; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] 3475; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 3476; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] 3477; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] 3478; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11 3479; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15 3480; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 3481; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm0 3482; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] 3483; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 3484; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3485; AVX-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3486; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3487; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] 3488; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] 3489; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 3490; AVX-NEXT: vandnps %ymm12, %ymm4, %ymm12 3491; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 3492; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 3493; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 3494; AVX-NEXT: vandnps %ymm15, %ymm4, %ymm12 3495; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 3496; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 3497; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3498; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] 3499; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] 3500; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] 3501; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm0 3502; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 3503; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 3504; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3505; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] 3506; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] 3507; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm4 3508; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 3509; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 3510; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 3511; AVX-NEXT: vandps %ymm4, %ymm6, %ymm6 3512; AVX-NEXT: vorps %ymm0, %ymm6, %ymm0 3513; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] 3514; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3515; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] 3516; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] 3517; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 3518; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] 3519; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 3520; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm6 3521; AVX-NEXT: vorps %ymm6, %ymm0, %ymm0 3522; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6 3523; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero 3524; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3525; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15] 3526; AVX-NEXT: vpor %xmm6, %xmm12, %xmm1 3527; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3528; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] 3529; AVX-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero 3530; AVX-NEXT: vpor %xmm6, %xmm0, %xmm0 3531; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 3532; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 3533; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] 3534; AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm2 3535; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3536; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3537; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] 3538; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2 3539; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] 3540; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] 3541; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 3542; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 3543; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1 3544; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 3545; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 3546; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3547; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] 3548; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3549; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 3550; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] 3551; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] 3552; AVX-NEXT: vpor %xmm2, %xmm6, %xmm2 3553; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] 3554; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero 3555; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 3556; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 3557; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 3558; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 3559; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 3560; AVX-NEXT: vorps %ymm2, %ymm1, %ymm6 3561; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] 3562; AVX-NEXT: vpmovsxdq {{.*#+}} xmm2 = [16777216,197120] 3563; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm3 3564; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 3565; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] 3566; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5] 3567; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm7 3568; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 3569; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 3570; AVX-NEXT: vandps %ymm4, %ymm5, %ymm4 3571; AVX-NEXT: vorps %ymm1, %ymm4, %ymm4 3572; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] 3573; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 
16-byte Reload 3574; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 3575; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] 3576; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] 3577; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero 3578; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15] 3579; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 3580; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] 3581; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 3582; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 3583; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 3584; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4 3585; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 3586; AVX-NEXT: vorps %ymm5, %ymm4, %ymm5 3587; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3588; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] 3589; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3590; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 3591; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3592; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] 3593; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3 3594; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 3595; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] 3596; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 3597; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 3598; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 3599; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] 3600; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3601; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] 3602; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 3603; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3604; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] 3605; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 3606; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 3607; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 3608; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2 3609; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1 3610; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 3611; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 3612; AVX-NEXT: vmovaps %ymm1, (%rax) 3613; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3614; AVX-NEXT: vmovaps %ymm1, 128(%rax) 3615; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3616; AVX-NEXT: vmovaps %ymm1, 32(%rax) 3617; AVX-NEXT: vmovaps %ymm5, 96(%rax) 3618; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3619; AVX-NEXT: vmovaps %ymm1, 160(%rax) 3620; AVX-NEXT: vmovaps %ymm6, 64(%rax) 3621; AVX-NEXT: vmovdqa %xmm0, 192(%rax) 3622; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3623; AVX-NEXT: vmovaps %xmm0, 208(%rax) 3624; AVX-NEXT: addq $216, %rsp 3625; AVX-NEXT: vzeroupper 3626; AVX-NEXT: retq 3627; 3628; AVX2-LABEL: store_i8_stride7_vf32: 3629; AVX2: # %bb.0: 3630; AVX2-NEXT: pushq %rax 3631; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3632; AVX2-NEXT: vmovdqa (%rdi), %ymm4 
3633; AVX2-NEXT: vmovdqa (%rsi), %ymm6 3634; AVX2-NEXT: vmovdqa (%rdx), %ymm3 3635; AVX2-NEXT: vmovdqa (%rcx), %ymm5 3636; AVX2-NEXT: vmovdqa (%r8), %ymm7 3637; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3638; AVX2-NEXT: vmovdqa (%r9), %ymm2 3639; AVX2-NEXT: vmovdqa (%rax), %ymm1 3640; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3641; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] 3642; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 3643; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] 3644; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] 3645; AVX2-NEXT: # ymm9 = mem[0,1,0,1] 3646; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 3647; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 3648; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] 3649; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 3650; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] 3651; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] 3652; AVX2-NEXT: # ymm10 = mem[0,1,0,1] 3653; AVX2-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 3654; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 3655; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 3656; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 3657; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] 3658; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero 3659; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 3660; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 3661; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] 3662; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 3663; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 3664; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 3665; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 3666; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 3667; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3668; AVX2-NEXT: vmovdqa (%rdx), %xmm10 3669; AVX2-NEXT: vmovdqa (%rcx), %xmm11 3670; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 3671; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 3672; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,1] 3673; AVX2-NEXT: vmovdqa (%rdi), %xmm14 3674; AVX2-NEXT: vmovdqa (%rsi), %xmm0 3675; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] 3676; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 3677; AVX2-NEXT: vpermq {{.*#+}} ymm9 = 
ymm9[0,1,0,1] 3678; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] 3679; AVX2-NEXT: vpblendvb %ymm12, %ymm8, %ymm9, %ymm8 3680; AVX2-NEXT: vmovdqa (%r9), %xmm12 3681; AVX2-NEXT: vmovdqa (%r8), %xmm13 3682; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 3683; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 3684; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 3685; AVX2-NEXT: vmovdqa (%rax), %xmm15 3686; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] 3687; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] 3688; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 3689; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 3690; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm7, %ymm1 3691; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 3692; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm1, %ymm1 3693; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3694; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero 3695; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] 3696; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1 3697; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] 3698; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] 3699; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 3700; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3701; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 3702; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 3703; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 3704; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] 3705; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero 3706; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 3707; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 3708; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] 3709; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 3710; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 3711; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 3712; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 3713; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 3714; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] 3715; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 3716; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 3717; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 3718; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 
3719; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3720; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] 3721; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 3722; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 3723; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 3724; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 3725; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7] 3726; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] 3727; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] 3728; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 3729; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 3730; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 3731; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 3732; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] 3733; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 3734; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] 3735; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] 3736; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 3737; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 3738; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 3739; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] 3740; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero 3741; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 3742; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3743; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] 3744; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 3745; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] 3746; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 3747; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero 3748; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 3749; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3750; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 3751; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 3752; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] 3753; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] 3754; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] 3755; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 3756; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 3757; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 3758; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] 3759; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero 3760; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 3761; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 3762; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero 3763; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] 3764; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 3765; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 3766; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 3767; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 3768; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 3769; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero 3770; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 3771; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 3772; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 3773; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 3774; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] 3775; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 3776; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 3777; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 3778; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] 3779; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero 3780; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 3781; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero 3782; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero 3783; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm3 3784; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] 3785; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 3786; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] 3787; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero 3788; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2 3789; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 3790; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] 3791; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 3792; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 3793; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 3794; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3795; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) 3796; AVX2-NEXT: vmovdqa %ymm1, 160(%rax) 3797; AVX2-NEXT: vmovdqa %ymm10, (%rax) 3798; AVX2-NEXT: vmovdqa %ymm0, 128(%rax) 3799; AVX2-NEXT: vmovdqa %ymm9, 32(%rax) 3800; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3801; AVX2-NEXT: vmovaps %ymm0, 64(%rax) 3802; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3803; AVX2-NEXT: vmovaps %ymm0, 192(%rax) 3804; AVX2-NEXT: popq %rax 3805; AVX2-NEXT: vzeroupper 3806; AVX2-NEXT: retq 3807; 3808; AVX2-FP-LABEL: store_i8_stride7_vf32: 3809; AVX2-FP: # %bb.0: 3810; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3811; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 3812; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm3 3813; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 3814; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 3815; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm9 3816; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm10 3817; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 3818; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 3819; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 3820; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 3821; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm15 3822; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] 3823; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 3824; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 3825; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] 3826; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 3827; AVX2-FP-NEXT: vmovdqa (%rax), %xmm11 3828; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] 3829; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 3830; AVX2-FP-NEXT: vmovdqa (%r9), %xmm12 3831; AVX2-FP-NEXT: vmovdqa (%r8), %xmm14 3832; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 3833; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 3834; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3835; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 3836; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 3837; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 3838; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 3839; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero 3840; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] 3841; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 3842; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 3843; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] 3844; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] 3845; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 3846; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3847; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 3848; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 3849; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 3850; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] 3851; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] 3852; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero 3853; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 3854; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 3855; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 3856; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 3857; AVX2-FP-NEXT: vmovdqa (%r8), %ymm6 3858; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 3859; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 3860; AVX2-FP-NEXT: vmovdqa (%r9), %ymm8 3861; AVX2-FP-NEXT: vmovdqa (%rax), %ymm7 3862; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] 3863; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 3864; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 3865; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 3866; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 3867; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 3868; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] 3869; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 3870; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] 3871; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] 3872; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] 3873; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 3874; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] 3875; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 3876; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, 
%ymm10 3877; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 3878; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 3879; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] 3880; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero 3881; AVX2-FP-NEXT: vpor %ymm10, %ymm11, %ymm10 3882; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 3883; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero 3884; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero 3885; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 3886; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 3887; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 3888; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 3889; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] 3890; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero 3891; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 3892; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 3893; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] 3894; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 3895; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 3896; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 3897; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 3898; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 3899; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] 3900; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero 3901; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 3902; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 3903; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero 3904; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] 3905; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 3906; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 3907; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 3908; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 3909; AVX2-FP-NEXT: vpshufb 
{{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero 3910; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero 3911; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 3912; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 3913; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 3914; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 3915; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] 3916; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 3917; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 3918; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 3919; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] 3920; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero 3921; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 3922; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 3923; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero 3924; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero 3925; AVX2-FP-NEXT: vpor %ymm13, %ymm14, %ymm13 3926; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 3927; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] 3928; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 3929; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] 3930; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero 3931; AVX2-FP-NEXT: vpor %ymm13, %ymm14, %ymm13 3932; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 3933; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] 3934; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] 3935; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] 3936; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 3937; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 3938; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 3939; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] 3940; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero 3941; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 3942; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero 3943; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero 3944; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0 3945; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] 3946; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 3947; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] 3948; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero 3949; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 3950; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 3951; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] 3952; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 3953; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 3954; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 3955; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3956; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) 3957; AVX2-FP-NEXT: vmovdqa %ymm12, 128(%rax) 3958; AVX2-FP-NEXT: vmovdqa %ymm11, 160(%rax) 3959; AVX2-FP-NEXT: vmovdqa %ymm9, (%rax) 3960; AVX2-FP-NEXT: vmovdqa %ymm10, 192(%rax) 3961; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax) 3962; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%rax) 3963; AVX2-FP-NEXT: vzeroupper 3964; AVX2-FP-NEXT: retq 3965; 3966; AVX2-FCP-LABEL: store_i8_stride7_vf32: 3967; AVX2-FCP: # %bb.0: 3968; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3969; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 3970; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm3 3971; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0 3972; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 3973; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 3974; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm10 3975; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 3976; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 3977; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 3978; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 3979; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15 3980; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] 3981; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 3982; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 3983; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] 3984; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 3985; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm11 3986; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6] 3987; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] 3988; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] 3989; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 3990; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm12 3991; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm14 3992; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 3993; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 3994; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 3995; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 3996; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 3997; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 3998; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 3999; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero 4000; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] 4001; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 4002; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 4003; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] 4004; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] 4005; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 4006; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 4007; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 4008; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 4009; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 4010; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] 4011; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] 4012; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero 4013; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 4014; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] 4015; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 4016; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 4017; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm6 4018; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 4019; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 4020; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm8 4021; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm7 4022; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] 4023; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4024; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] 4025; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 4026; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4027; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4028; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] 4029; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 4030; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7] 4031; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] 4032; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 4033; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] 4034; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4035; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] 4036; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 4037; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 4038; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 4039; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 4040; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] 4041; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero 4042; AVX2-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 4043; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 4044; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero 4045; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero 4046; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 4047; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 4048; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] 4049; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 4050; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 4051; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] 4052; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 4053; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] 4054; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero 4055; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 4056; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 4057; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] 4058; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 4059; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 
4060; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 4061; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] 4062; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero 4063; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 4064; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 4065; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero 4066; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero 4067; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 4068; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 4069; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 4070; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 4071; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] 4072; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero 4073; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 4074; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 4075; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] 4076; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 4077; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 4078; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 4079; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 4080; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 4081; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] 4082; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero 4083; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 4084; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 4085; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero 4086; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] 4087; AVX2-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 4088; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 4089; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 4090; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 4091; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero 4092; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero 4093; AVX2-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 4094; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] 4095; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 4096; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] 4097; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] 4098; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 4099; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 4100; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 4101; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] 4102; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero 4103; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 4104; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero 4105; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero 4106; AVX2-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 4107; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] 4108; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 4109; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] 4110; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero 4111; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 4112; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 4113; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] 4114; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 4115; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 4116; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 4117; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4118; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) 4119; AVX2-FCP-NEXT: vmovdqa %ymm10, 128(%rax) 4120; AVX2-FCP-NEXT: vmovdqa %ymm12, 160(%rax) 4121; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rax) 4122; AVX2-FCP-NEXT: vmovdqa %ymm11, 192(%rax) 4123; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rax) 4124; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%rax) 4125; AVX2-FCP-NEXT: 
vzeroupper 4126; AVX2-FCP-NEXT: retq 4127; 4128; AVX512-LABEL: store_i8_stride7_vf32: 4129; AVX512: # %bb.0: 4130; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 4131; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 4132; AVX512-NEXT: vmovdqa (%rdi), %ymm3 4133; AVX512-NEXT: vmovdqa (%rsi), %ymm4 4134; AVX512-NEXT: vmovdqa (%rdx), %ymm5 4135; AVX512-NEXT: vmovdqa (%rcx), %ymm6 4136; AVX512-NEXT: vmovdqa (%r8), %ymm1 4137; AVX512-NEXT: vmovdqa (%r9), %ymm2 4138; AVX512-NEXT: vmovdqa (%r10), %ymm0 4139; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] 4140; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] 4141; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 4142; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] 4143; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero 4144; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero 4145; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 4146; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] 4147; AVX512-NEXT: vporq %zmm7, %zmm8, %zmm7 4148; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero 4149; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero 4150; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 4151; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] 4152; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] 4153; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] 4154; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4155; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4156; AVX512-NEXT: vporq %zmm8, %zmm9, %zmm8 4157; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7)) 4158; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] 4159; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 4160; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18 4161; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] 4162; AVX512-NEXT: vpermi2d %zmm7, %zmm9, %zmm10 4163; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] 4164; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 4165; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 4166; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] 4167; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero 4168; AVX512-NEXT: vpshufb 
{{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] 4169; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 4170; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 4171; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4172; AVX512-NEXT: vporq %zmm7, %zmm9, %zmm7 4173; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm10 ^ (mem & (zmm7 ^ zmm10)) 4174; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8)) 4175; AVX512-NEXT: vmovdqa (%rsi), %xmm9 4176; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero 4177; AVX512-NEXT: vmovdqa (%rdi), %xmm10 4178; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] 4179; AVX512-NEXT: vpor %xmm8, %xmm11, %xmm8 4180; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 4181; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4182; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm11, %zmm8 4183; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 4184; AVX512-NEXT: vmovdqa (%rcx), %xmm14 4185; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 4186; AVX512-NEXT: vmovdqa (%rdx), %xmm15 4187; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u] 4188; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 4189; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] 4190; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4191; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 4192; AVX512-NEXT: vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5] 4193; AVX512-NEXT: vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) 4194; AVX512-NEXT: vmovdqa (%r9), %xmm11 4195; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] 4196; AVX512-NEXT: vmovdqa (%r8), %xmm12 4197; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero 4198; AVX512-NEXT: vpor %xmm8, %xmm13, %xmm8 4199; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 4200; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4201; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 4202; AVX512-NEXT: vpermq {{.*#+}} zmm17 = zmm8[0,1,0,1,4,5,4,5] 4203; AVX512-NEXT: vmovdqa (%r10), %xmm13 4204; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 4205; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[1,1,0,0,4,5,6,7] 4206; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 4207; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 4208; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm0[0,0,1,0,4,4,5,4] 4209; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm17)) 4210; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) 4211; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] 4212; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero 4213; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 4214; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] 4215; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 4216; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 4217; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 4218; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] 4219; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] 4220; AVX512-NEXT: vpor %ymm1, %ymm14, %ymm1 4221; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 4222; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4223; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4224; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 4225; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) 4226; AVX512-NEXT: vmovdqa64 %ymm19, %ymm14 4227; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero 4228; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] 4229; AVX512-NEXT: vpor %ymm0, %ymm9, %ymm0 4230; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] 4231; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 4232; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4233; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 4234; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,5,6] 4235; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] 4236; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4237; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 4238; AVX512-NEXT: vmovdqa64 %ymm18, %ymm11 4239; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] 4240; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4241; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem) 4242; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) 4243; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero 4244; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] 4245; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 4246; AVX512-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[2,3,2,3] 4247; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero 4248; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] 4249; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm1 4250; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 4251; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 4252; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29] 4253; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero 4254; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 4255; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4256; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 4257; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 4258; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 4259; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) 4260; AVX512-NEXT: vmovdqa %ymm2, 192(%rax) 4261; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) 4262; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rax) 4263; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) 4264; AVX512-NEXT: vzeroupper 4265; AVX512-NEXT: retq 4266; 4267; AVX512-FCP-LABEL: store_i8_stride7_vf32: 4268; AVX512-FCP: # %bb.0: 4269; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4270; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4271; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 4272; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 4273; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm5 4274; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 4275; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 4276; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 4277; AVX512-FCP-NEXT: vmovdqa64 (%r10), %ymm17 4278; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 4279; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero 4280; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 4281; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] 4282; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 4283; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 4284; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4285; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 4286; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] 4287; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm11 4288; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] 4289; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm12 4290; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] 4291; AVX512-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 4292; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 4293; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4294; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm13, 
%zmm10 4295; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] 4296; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) 4297; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm10 4298; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] 4299; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] 4300; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm7 4301; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 4302; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] 4303; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm16 4304; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm13 4305; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] 4306; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm14 4307; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero 4308; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 4309; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 4310; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4311; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 4312; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] 4313; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm16 ^ (mem & (zmm7 ^ zmm16)) 4314; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm15)) 4315; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] 4316; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero 4317; AVX512-FCP-NEXT: vpor %ymm0, %ymm15, %ymm0 4318; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] 4319; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 4320; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] 4321; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 4322; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] 4323; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] 4324; AVX512-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 4325; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 4326; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4327; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 4328; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 4329; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0)) 4330; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero 4331; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] 4332; AVX512-FCP-NEXT: vpor %ymm0, %ymm8, %ymm0 4333; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 4334; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 4335; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 4336; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 4337; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6] 4338; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] 4339; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] 4340; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 4341; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8 4342; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm12 4343; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u] 4344; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 4345; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm0 & mem) 4346; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm9)) 4347; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] 4348; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] 4349; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 4350; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] 4351; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero 4352; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero 4353; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4354; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4355; AVX512-FCP-NEXT: vporq %zmm0, %zmm9, %zmm0 4356; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero 4357; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero 4358; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4359; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4360; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] 4361; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] 4362; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 4363; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] 4364; AVX512-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9 4365; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0)) 4366; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] 4367; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 4368; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 4369; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] 4370; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero 4371; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] 4372; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 4373; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] 4374; AVX512-FCP-NEXT: vporq %zmm0, %zmm10, %zmm0 4375; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 4376; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,5,4,0,5,0,4,0] 4377; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 4378; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 4379; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 4380; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 4381; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0)) 4382; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm9)) 4383; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero 4384; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] 4385; AVX512-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 4386; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4387; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero 4388; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] 4389; AVX512-FCP-NEXT: vpor %ymm4, %ymm3, %ymm3 4390; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 4391; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) 4392; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] 4393; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero 4394; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 4395; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4396; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 4397; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 4398; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 4399; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) 4400; AVX512-FCP-NEXT: vmovdqa %ymm1, 192(%rax) 4401; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 4402; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) 4403; 
AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) 4404; AVX512-FCP-NEXT: vzeroupper 4405; AVX512-FCP-NEXT: retq 4406; 4407; AVX512DQ-LABEL: store_i8_stride7_vf32: 4408; AVX512DQ: # %bb.0: 4409; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 4410; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 4411; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 4412; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 4413; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm5 4414; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm6 4415; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 4416; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 4417; AVX512DQ-NEXT: vmovdqa (%r10), %ymm0 4418; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] 4419; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] 4420; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 4421; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] 4422; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero 4423; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero 4424; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 4425; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] 4426; AVX512DQ-NEXT: vporq %zmm7, %zmm8, %zmm7 4427; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero 4428; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero 4429; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 4430; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] 4431; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] 4432; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] 4433; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4434; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4435; AVX512DQ-NEXT: vporq %zmm8, %zmm9, %zmm8 4436; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7)) 4437; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] 4438; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 4439; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18 4440; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] 4441; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm9, %zmm10 4442; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] 4443; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 4444; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 4445; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] 4446; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero 4447; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] 4448; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 4449; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 4450; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4451; AVX512DQ-NEXT: vporq %zmm7, %zmm9, %zmm7 4452; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm10 ^ (mem & (zmm7 ^ zmm10)) 4453; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8)) 4454; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 4455; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero 4456; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 4457; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] 4458; AVX512DQ-NEXT: vpor %xmm8, %xmm11, %xmm8 4459; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 4460; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4461; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm11, %zmm8 4462; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 4463; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm14 4464; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 4465; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 4466; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u] 4467; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 4468; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] 4469; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4470; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 4471; AVX512DQ-NEXT: vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5] 4472; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) 4473; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 4474; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] 4475; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12 4476; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero 4477; AVX512DQ-NEXT: vpor %xmm8, %xmm13, %xmm8 4478; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 4479; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4480; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 4481; AVX512DQ-NEXT: vpermq {{.*#+}} zmm17 = zmm8[0,1,0,1,4,5,4,5] 4482; AVX512DQ-NEXT: vmovdqa (%r10), %xmm13 4483; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 4484; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[1,1,0,0,4,5,6,7] 4485; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 4486; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 4487; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm0[0,0,1,0,4,4,5,4] 4488; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ 
zmm17)) 4489; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) 4490; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] 4491; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero 4492; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 4493; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] 4494; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 4495; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 4496; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 4497; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] 4498; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] 4499; AVX512DQ-NEXT: vpor %ymm1, %ymm14, %ymm1 4500; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 4501; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4502; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4503; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 4504; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) 4505; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm14 4506; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero 4507; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] 4508; AVX512DQ-NEXT: vpor %ymm0, %ymm9, %ymm0 4509; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] 4510; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 4511; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4512; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 4513; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,5,6] 4514; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] 4515; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4516; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 4517; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm11 4518; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] 4519; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4520; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem) 4521; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) 4522; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero 4523; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] 4524; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 4525; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4526; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero 4527; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] 4528; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm1 4529; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 4530; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 4531; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29] 4532; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero 4533; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 4534; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4535; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 4536; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 4537; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) 4538; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) 4539; AVX512DQ-NEXT: vmovdqa %ymm2, 192(%rax) 4540; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) 4541; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rax) 4542; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) 4543; AVX512DQ-NEXT: vzeroupper 4544; AVX512DQ-NEXT: retq 4545; 4546; AVX512DQ-FCP-LABEL: store_i8_stride7_vf32: 4547; AVX512DQ-FCP: # %bb.0: 4548; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4549; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4550; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 4551; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 4552; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm5 4553; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 4554; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 4555; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 4556; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %ymm17 4557; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 4558; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero 4559; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 4560; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] 4561; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 4562; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 4563; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4564; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 4565; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] 4566; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm11 4567; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] 4568; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm12 4569; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] 4570; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 4571; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 
= xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 4572; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4573; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm13, %zmm10 4574; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] 4575; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) 4576; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm10 4577; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] 4578; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] 4579; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm7 4580; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 4581; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] 4582; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm16 4583; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm13 4584; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] 4585; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm14 4586; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero 4587; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 4588; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 4589; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4590; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 4591; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] 4592; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm16 ^ (mem & (zmm7 ^ zmm16)) 4593; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm15)) 4594; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] 4595; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero 4596; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm15, %ymm0 4597; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] 4598; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 4599; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] 4600; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 4601; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] 4602; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] 4603; AVX512DQ-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 4604; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 4605; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4606; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 4607; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 4608; 
AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0)) 4609; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero 4610; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] 4611; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm8, %ymm0 4612; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 4613; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 4614; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] 4615; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 4616; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6] 4617; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] 4618; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] 4619; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 4620; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8 4621; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm12 4622; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u] 4623; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 4624; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm0 & mem) 4625; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm9)) 4626; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] 4627; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] 4628; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 4629; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] 4630; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero 4631; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero 4632; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4633; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4634; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm9, %zmm0 4635; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero 4636; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero 4637; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 4638; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4639; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] 4640; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] 4641; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 4642; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] 4643; AVX512DQ-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9 4644; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0)) 4645; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] 4646; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 4647; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 4648; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] 4649; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero 4650; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] 4651; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 4652; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] 4653; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm10, %zmm0 4654; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 4655; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,5,4,0,5,0,4,0] 4656; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 4657; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 4658; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 4659; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 4660; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0)) 4661; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm9)) 4662; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero 4663; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] 4664; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 4665; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4666; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero 4667; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] 4668; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm3, %ymm3 4669; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 4670; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) 4671; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] 4672; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero 4673; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 4674; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 4675; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 4676; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 4677; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 4678; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) 4679; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 192(%rax) 4680; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 4681; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) 4682; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) 4683; AVX512DQ-FCP-NEXT: vzeroupper 4684; AVX512DQ-FCP-NEXT: retq 4685; 4686; AVX512BW-LABEL: store_i8_stride7_vf32: 4687; AVX512BW: # %bb.0: 4688; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4689; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 4690; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 4691; AVX512BW-NEXT: vmovdqa (%rsi), %ymm2 4692; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 4693; AVX512BW-NEXT: vmovdqa (%rcx), %ymm3 4694; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] 4695; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero 4696; AVX512BW-NEXT: vpor %ymm0, %ymm5, %ymm0 4697; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 4698; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 4699; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] 4700; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4701; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 4702; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 4703; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero 4704; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] 4705; AVX512BW-NEXT: vpor %ymm0, %ymm6, %ymm0 4706; AVX512BW-NEXT: vmovdqa (%rdx), %xmm12 4707; AVX512BW-NEXT: vmovdqa (%rcx), %xmm14 4708; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 4709; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 4710; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 4711; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 4712; AVX512BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 4713; AVX512BW-NEXT: kmovq %rcx, %k1 4714; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} 4715; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] 4716; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero 4717; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] 4718; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = 
zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero 4719; AVX512BW-NEXT: vpor %ymm7, %ymm9, %ymm7 4720; AVX512BW-NEXT: vmovdqa (%r9), %xmm11 4721; AVX512BW-NEXT: vmovdqa (%r8), %xmm13 4722; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 4723; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 4724; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 4725; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm9 4726; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] 4727; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 4728; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] 4729; AVX512BW-NEXT: vpermw %ymm7, %ymm15, %ymm15 4730; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 4731; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 4732; AVX512BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 4733; AVX512BW-NEXT: kmovq %rcx, %k1 4734; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k1} 4735; AVX512BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 4736; AVX512BW-NEXT: kmovq %rcx, %k1 4737; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm0 {%k1} 4738; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm9 4739; AVX512BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero 4740; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 4741; AVX512BW-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] 4742; AVX512BW-NEXT: vporq %zmm9, %zmm15, %zmm9 4743; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 4744; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 4745; AVX512BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] 4746; AVX512BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 4747; AVX512BW-NEXT: kmovd %ecx, %k1 4748; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] 4749; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27] 4750; AVX512BW-NEXT: vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero 4751; AVX512BW-NEXT: vporq %ymm16, %ymm17, %ymm16 4752; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 4753; AVX512BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] 4754; AVX512BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 4755; AVX512BW-NEXT: kmovq %rcx, %k2 4756; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} 4757; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = 
[10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 4758; AVX512BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 4759; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] 4760; AVX512BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero 4761; AVX512BW-NEXT: vporq %zmm16, %zmm17, %zmm16 4762; AVX512BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] 4763; AVX512BW-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 4764; AVX512BW-NEXT: kmovq %rcx, %k2 4765; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} 4766; AVX512BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C 4767; AVX512BW-NEXT: kmovq %rcx, %k2 4768; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm9 {%k2} 4769; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 4770; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] 4771; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 4772; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 4773; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4774; AVX512BW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 4775; AVX512BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] 4776; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero 4777; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] 4778; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 4779; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 4780; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4781; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 4782; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 4783; AVX512BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C 4784; AVX512BW-NEXT: kmovq %rcx, %k2 4785; AVX512BW-NEXT: vmovdqu8 %zmm12, %zmm8 {%k2} 4786; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] 4787; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero 4788; AVX512BW-NEXT: vpor %xmm10, %xmm12, %xmm10 4789; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 4790; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4791; AVX512BW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 4792; AVX512BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] 4793; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = 
[1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 4794; AVX512BW-NEXT: vpermw %zmm7, %zmm11, %zmm11 4795; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 4796; AVX512BW-NEXT: kmovq %rcx, %k2 4797; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} 4798; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 4799; AVX512BW-NEXT: kmovq %rcx, %k2 4800; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm8 {%k2} 4801; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 4802; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] 4803; AVX512BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 4804; AVX512BW-NEXT: kmovd %ecx, %k2 4805; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] 4806; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3] 4807; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] 4808; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 4809; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] 4810; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} 4811; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] 4812; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 4813; AVX512BW-NEXT: kmovd %ecx, %k1 4814; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 4815; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] 4816; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 4817; AVX512BW-NEXT: vpermw %ymm7, %ymm2, %ymm2 4818; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero 4819; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] 4820; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3 4821; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 4822; AVX512BW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 4823; AVX512BW-NEXT: kmovd %ecx, %k1 4824; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} 4825; AVX512BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E 4826; AVX512BW-NEXT: kmovd %ecx, %k1 4827; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} 4828; AVX512BW-NEXT: vmovdqa %ymm1, 192(%rax) 4829; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) 4830; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) 4831; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) 4832; AVX512BW-NEXT: vzeroupper 4833; AVX512BW-NEXT: retq 4834; 4835; AVX512BW-FCP-LABEL: store_i8_stride7_vf32: 4836; AVX512BW-FCP: # %bb.0: 4837; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4838; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 4839; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 4840; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 4841; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 4842; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 4843; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] 4844; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero 4845; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 4846; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 4847; AVX512BW-FCP-NEXT: vmovdqa 
(%rsi), %xmm9 4848; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] 4849; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4850; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 4851; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 4852; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero 4853; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] 4854; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 4855; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 4856; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm14 4857; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 4858; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 4859; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 4860; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 4861; AVX512BW-FCP-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 4862; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4863; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} 4864; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] 4865; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero 4866; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] 4867; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero 4868; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 4869; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm11 4870; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm13 4871; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 4872; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 4873; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] 4874; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 4875; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] 4876; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 4877; AVX512BW-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4878; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm15, %ymm15 4879; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 4880; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 4881; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 4882; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4883; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} 4884; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 4885; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4886; 
AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} 4887; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm10 4888; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] 4889; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm15 4890; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero 4891; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm15, %zmm10 4892; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7] 4893; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm10 4894; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero 4895; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 4896; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] 4897; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm16, %zmm10 4898; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] 4899; AVX512BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 4900; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4901; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} 4902; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 4903; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 4904; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] 4905; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero 4906; AVX512BW-FCP-NEXT: vporq %zmm16, %zmm17, %zmm16 4907; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] 4908; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 4909; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4910; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm16 {%k1} 4911; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C 4912; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4913; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} 4914; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 4915; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] 4916; 
AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 4917; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 4918; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 4919; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 4920; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] 4921; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero 4922; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] 4923; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 4924; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 4925; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 4926; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 4927; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 4928; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C 4929; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4930; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} 4931; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] 4932; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero 4933; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 4934; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 4935; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 4936; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 4937; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] 4938; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 4939; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 4940; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 4941; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4942; AVX512BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm9 {%k1} 4943; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 4944; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 4945; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} 4946; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] 4947; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero 4948; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 4949; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 4950; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero 4951; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero 4952; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 4953; AVX512BW-FCP-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[2,3,2,3] 4954; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 4955; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 4956; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} 4957; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] 4958; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] 4959; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm2, %ymm2 4960; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero 4961; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] 4962; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 4963; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 4964; AVX512BW-FCP-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 4965; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 4966; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} 4967; AVX512BW-FCP-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E 4968; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 4969; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} 4970; AVX512BW-FCP-NEXT: vmovdqa %ymm1, 192(%rax) 4971; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) 4972; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) 4973; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 4974; AVX512BW-FCP-NEXT: vzeroupper 4975; AVX512BW-FCP-NEXT: retq 4976; 4977; AVX512DQ-BW-LABEL: store_i8_stride7_vf32: 4978; AVX512DQ-BW: # %bb.0: 4979; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4980; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 4981; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 4982; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm2 4983; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 4984; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm3 4985; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] 4986; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero 4987; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm5, %ymm0 4988; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 4989; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 4990; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] 4991; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 4992; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 4993; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 4994; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero 4995; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] 4996; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm6, %ymm0 4997; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm12 4998; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm14 4999; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 5000; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 5001; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 5002; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 5003; AVX512DQ-BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 5004; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 5005; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} 5006; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] 5007; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero 5008; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] 5009; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero 5010; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm9, %ymm7 5011; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm11 5012; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm13 5013; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 5014; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 5015; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] 5016; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm9 5017; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] 5018; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 5019; AVX512DQ-BW-NEXT: # ymm15 = mem[0,1,0,1] 5020; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm15, %ymm15 5021; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 5022; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 5023; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 5024; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 5025; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k1} 5026; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 5027; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 5028; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm0 {%k1} 5029; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm9 5030; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero 5031; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 5032; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] 5033; AVX512DQ-BW-NEXT: vporq %zmm9, %zmm15, %zmm9 5034; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] 5035; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 5036; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] 5037; AVX512DQ-BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 5038; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 5039; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] 5040; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27] 5041; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero 5042; AVX512DQ-BW-NEXT: vporq %ymm16, %ymm17, %ymm16 5043; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 5044; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] 5045; AVX512DQ-BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 5046; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 5047; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} 5048; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 5049; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 5050; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] 5051; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero 5052; AVX512DQ-BW-NEXT: vporq %zmm16, %zmm17, %zmm16 5053; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] 5054; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 5055; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 5056; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} 5057; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C 5058; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 5059; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm9 {%k2} 5060; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 5061; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] 5062; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15 5063; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 5064; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 5065; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 5066; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] 5067; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero 5068; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] 5069; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 5070; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 5071; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 5072; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 5073; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 5074; 
AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C 5075; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 5076; AVX512DQ-BW-NEXT: vmovdqu8 %zmm12, %zmm8 {%k2} 5077; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] 5078; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero 5079; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm12, %xmm10 5080; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 5081; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 5082; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 5083; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] 5084; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 5085; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm11, %zmm11 5086; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 5087; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 5088; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} 5089; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 5090; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 5091; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm8 {%k2} 5092; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 5093; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] 5094; AVX512DQ-BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 5095; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 5096; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] 5097; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3] 5098; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] 5099; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 5100; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] 5101; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} 5102; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] 5103; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 5104; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 5105; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} 5106; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] 5107; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] 5108; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm2, %ymm2 5109; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero 5110; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] 5111; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3 5112; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 5113; AVX512DQ-BW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 5114; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 5115; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} 5116; AVX512DQ-BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E 5117; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 5118; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} 5119; AVX512DQ-BW-NEXT: vmovdqa %ymm1, 192(%rax) 5120; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) 
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm9
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18]
; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm14
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
; AVX512DQ-BW-FCP-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1
; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm13
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw
{{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 5163; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 5164; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] 5165; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 5166; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] 5167; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 5168; AVX512DQ-BW-FCP-NEXT: # ymm15 = mem[0,1,0,1] 5169; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm15, %ymm15 5170; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 5171; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 5172; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 5173; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5174; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} 5175; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 5176; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5177; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} 5178; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm10 5179; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] 5180; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm15 5181; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero 5182; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm15, %zmm10 5183; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7] 5184; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm10 5185; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero 5186; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 5187; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] 5188; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm16, %zmm10 5189; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] 5190; AVX512DQ-BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 5191; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5192; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} 5193; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 5194; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 5195; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = 
zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] 5196; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero 5197; AVX512DQ-BW-FCP-NEXT: vporq %zmm16, %zmm17, %zmm16 5198; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] 5199; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 5200; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5201; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm16 {%k1} 5202; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C 5203; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5204; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} 5205; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 5206; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] 5207; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 5208; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] 5209; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 5210; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 5211; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] 5212; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero 5213; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] 5214; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 5215; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] 5216; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 5217; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 5218; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 5219; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C 5220; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5221; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} 5222; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] 5223; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero 5224; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 5225; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] 5226; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 5227; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 5228; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] 5229; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = 
[1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 5230; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 5231; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 5232; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5233; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm9 {%k1} 5234; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 5235; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 5236; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} 5237; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] 5238; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero 5239; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 5240; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 5241; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero 5242; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero 5243; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 5244; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 5245; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 5246; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 5247; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} 5248; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] 5249; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] 5250; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm2, %ymm2 5251; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero 5252; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] 5253; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 5254; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 5255; AVX512DQ-BW-FCP-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 5256; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 5257; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} 5258; AVX512DQ-BW-FCP-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E 5259; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 5260; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} 5261; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, 192(%rax) 5262; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) 5263; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) 5264; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) 5265; AVX512DQ-BW-FCP-NEXT: vzeroupper 5266; AVX512DQ-BW-FCP-NEXT: retq 5267 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 5268 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64 5269 %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64 5270 %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64 5271 %in.vec4 = load <32 x i8>, ptr %in.vecptr4, align 64 5272 %in.vec5 = load <32 x i8>, ptr %in.vecptr5, align 64 5273 %in.vec6 = load <32 x i8>, ptr %in.vecptr6, align 64 5274 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, 
i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5275 %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5276 %3 = shufflevector <32 x i8> %in.vec4, <32 x i8> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5277 %4 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 5278 %5 = shufflevector <32 x i8> %in.vec6, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5279 %6 = shufflevector <64 x i8> %3, <64 x i8> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 
18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 5280 %7 = shufflevector <96 x i8> %6, <96 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 5281 %8 = shufflevector <128 x i8> %4, <128 x i8> %7, <224 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 
199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223> 5282 %interleaved.vec = shufflevector <224 x i8> %8, <224 x i8> poison, <224 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 215, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223> 5283 store <224 x i8> %interleaved.vec, ptr %out.vec, align 64 5284 ret void 5285} 5286 5287define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { 5288; SSE-LABEL: store_i8_stride7_vf64: 5289; SSE: # %bb.0: 5290; SSE-NEXT: subq $648, %rsp # imm = 0x288 5291; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 5292; SSE-NEXT: movdqa 48(%rdi), %xmm14 5293; SSE-NEXT: movdqa 48(%rsi), %xmm11 5294; SSE-NEXT: movdqa 48(%rdx), %xmm3 5295; SSE-NEXT: movdqa 48(%rcx), %xmm10 5296; SSE-NEXT: movdqa 48(%r8), %xmm9 5297; SSE-NEXT: movdqa 48(%r9), %xmm8 5298; SSE-NEXT: movdqa 48(%rax), %xmm13 5299; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] 5300; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5301; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 5302; SSE-NEXT: pand %xmm2, %xmm0 5303; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] 5304; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5305; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5306; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] 5307; SSE-NEXT: pandn %xmm1, %xmm2 5308; SSE-NEXT: por %xmm0, %xmm2 5309; SSE-NEXT: movdqa {{.*#+}} xmm12 = 
[255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 5310; SSE-NEXT: pand %xmm12, %xmm2 5311; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] 5312; SSE-NEXT: movdqa %xmm3, %xmm6 5313; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5314; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 5315; SSE-NEXT: pand %xmm4, %xmm0 5316; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] 5317; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5318; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5319; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] 5320; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] 5321; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5322; SSE-NEXT: pandn %xmm3, %xmm4 5323; SSE-NEXT: por %xmm0, %xmm4 5324; SSE-NEXT: movdqa %xmm12, %xmm0 5325; SSE-NEXT: pandn %xmm4, %xmm0 5326; SSE-NEXT: por %xmm2, %xmm0 5327; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 5328; SSE-NEXT: pand %xmm1, %xmm0 5329; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] 5330; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] 5331; SSE-NEXT: movdqa %xmm1, %xmm3 5332; SSE-NEXT: movdqa %xmm1, %xmm5 5333; SSE-NEXT: pandn %xmm2, %xmm3 5334; SSE-NEXT: por %xmm0, %xmm3 5335; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] 5336; SSE-NEXT: pand %xmm7, %xmm3 5337; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] 5338; SSE-NEXT: movdqa %xmm8, %xmm1 5339; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5340; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5341; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] 5342; SSE-NEXT: movdqa %xmm7, %xmm4 5343; SSE-NEXT: pandn %xmm0, %xmm4 5344; SSE-NEXT: por %xmm3, %xmm4 5345; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] 5346; SSE-NEXT: pand %xmm2, %xmm4 5347; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] 5348; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5349; SSE-NEXT: movdqa %xmm2, %xmm3 5350; SSE-NEXT: pandn %xmm0, %xmm3 5351; SSE-NEXT: por %xmm4, %xmm3 5352; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5353; SSE-NEXT: movdqa %xmm11, %xmm0 5354; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] 5355; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] 5356; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 5357; SSE-NEXT: movdqa %xmm5, %xmm4 5358; SSE-NEXT: pandn %xmm3, %xmm5 5359; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6] 5360; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] 5361; SSE-NEXT: pand %xmm4, %xmm3 5362; SSE-NEXT: por %xmm3, %xmm5 5363; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 5364; SSE-NEXT: movdqa %xmm4, %xmm3 5365; SSE-NEXT: pandn %xmm5, %xmm3 5366; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6] 5367; SSE-NEXT: movdqa %xmm6, %xmm15 5368; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] 5369; SSE-NEXT: movdqa %xmm7, %xmm6 5370; SSE-NEXT: pandn %xmm5, %xmm6 5371; SSE-NEXT: movdqa %xmm10, %xmm5 5372; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] 5373; SSE-NEXT: pshufd {{.*#+}} xmm8 = 
xmm5[2,1,2,3] 5374; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,0,4,5,6,7] 5375; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 5376; SSE-NEXT: pand %xmm7, %xmm8 5377; SSE-NEXT: por %xmm6, %xmm8 5378; SSE-NEXT: pand %xmm4, %xmm8 5379; SSE-NEXT: por %xmm3, %xmm8 5380; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,6,7] 5381; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] 5382; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 5383; SSE-NEXT: movdqa %xmm4, %xmm6 5384; SSE-NEXT: pandn %xmm3, %xmm6 5385; SSE-NEXT: pand %xmm4, %xmm8 5386; SSE-NEXT: por %xmm8, %xmm6 5387; SSE-NEXT: movdqa %xmm1, %xmm3 5388; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 5389; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] 5390; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 5391; SSE-NEXT: movdqa %xmm4, %xmm11 5392; SSE-NEXT: pandn %xmm8, %xmm11 5393; SSE-NEXT: pand %xmm4, %xmm6 5394; SSE-NEXT: por %xmm6, %xmm11 5395; SSE-NEXT: movdqa %xmm13, %xmm10 5396; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6] 5397; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] 5398; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 5399; SSE-NEXT: movdqa %xmm1, %xmm8 5400; SSE-NEXT: pandn %xmm6, %xmm8 5401; SSE-NEXT: pand %xmm1, %xmm11 5402; SSE-NEXT: por %xmm11, %xmm8 5403; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5404; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] 5405; SSE-NEXT: movdqa %xmm4, %xmm8 5406; SSE-NEXT: pandn %xmm6, %xmm8 5407; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7] 5408; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5409; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] 5410; SSE-NEXT: pand %xmm4, %xmm6 5411; SSE-NEXT: por %xmm8, %xmm6 5412; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 5413; SSE-NEXT: movdqa %xmm1, %xmm8 5414; SSE-NEXT: pandn %xmm6, %xmm8 5415; SSE-NEXT: movdqa %xmm14, %xmm13 5416; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5417; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5] 5418; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 5419; SSE-NEXT: movdqa %xmm7, %xmm11 5420; SSE-NEXT: pandn %xmm6, %xmm11 5421; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,2,2,3,4,5,6,7] 5422; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 5423; SSE-NEXT: pand %xmm7, %xmm6 5424; SSE-NEXT: por %xmm11, %xmm6 5425; SSE-NEXT: pand %xmm1, %xmm6 5426; SSE-NEXT: por %xmm8, %xmm6 5427; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,4,6,5] 5428; SSE-NEXT: movdqa %xmm9, %xmm1 5429; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5430; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] 5431; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 5432; SSE-NEXT: movdqa %xmm4, %xmm11 5433; SSE-NEXT: pandn %xmm8, %xmm11 5434; SSE-NEXT: pand %xmm4, %xmm6 5435; SSE-NEXT: por %xmm6, %xmm11 5436; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,2,2,3,4,5,6,7] 5437; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] 5438; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 5439; SSE-NEXT: movdqa %xmm9, %xmm8 5440; SSE-NEXT: pandn %xmm6, %xmm8 5441; SSE-NEXT: pand %xmm9, %xmm11 5442; SSE-NEXT: movdqa %xmm9, %xmm14 5443; SSE-NEXT: por %xmm11, %xmm8 
5444; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,7] 5445; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5446; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 5447; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 5448; SSE-NEXT: movdqa %xmm11, %xmm9 5449; SSE-NEXT: pandn %xmm6, %xmm9 5450; SSE-NEXT: pand %xmm11, %xmm8 5451; SSE-NEXT: por %xmm8, %xmm9 5452; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5453; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] 5454; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 5455; SSE-NEXT: movdqa %xmm11, %xmm6 5456; SSE-NEXT: pandn %xmm5, %xmm6 5457; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7] 5458; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] 5459; SSE-NEXT: pand %xmm11, %xmm5 5460; SSE-NEXT: por %xmm5, %xmm6 5461; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 5462; SSE-NEXT: movdqa %xmm8, %xmm5 5463; SSE-NEXT: pandn %xmm6, %xmm5 5464; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 5465; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 5466; SSE-NEXT: movdqa %xmm4, %xmm6 5467; SSE-NEXT: pandn %xmm0, %xmm6 5468; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] 5469; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 5470; SSE-NEXT: pand %xmm4, %xmm0 5471; SSE-NEXT: por %xmm0, %xmm6 5472; SSE-NEXT: pand %xmm8, %xmm6 5473; SSE-NEXT: por %xmm5, %xmm6 5474; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] 5475; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 5476; SSE-NEXT: movdqa %xmm7, %xmm5 5477; SSE-NEXT: pandn %xmm0, %xmm5 5478; SSE-NEXT: pand %xmm7, %xmm6 5479; SSE-NEXT: por %xmm6, %xmm5 5480; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,6,6,7] 5481; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] 5482; SSE-NEXT: movdqa %xmm2, %xmm3 5483; SSE-NEXT: pandn %xmm0, %xmm3 5484; SSE-NEXT: pand %xmm2, %xmm5 5485; SSE-NEXT: por %xmm5, %xmm3 5486; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,7,7,7] 5487; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 5488; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 5489; SSE-NEXT: movdqa %xmm5, %xmm1 5490; SSE-NEXT: pandn %xmm0, %xmm1 5491; SSE-NEXT: pand %xmm5, %xmm3 5492; SSE-NEXT: por %xmm3, %xmm1 5493; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5494; SSE-NEXT: movdqa (%rsi), %xmm0 5495; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5496; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5497; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5498; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] 5499; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 5500; SSE-NEXT: movdqa %xmm4, %xmm3 5501; SSE-NEXT: pandn %xmm0, %xmm3 5502; SSE-NEXT: movdqa (%rdi), %xmm0 5503; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5504; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 5505; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5506; SSE-NEXT: pand %xmm4, %xmm0 5507; SSE-NEXT: movdqa %xmm4, %xmm11 5508; SSE-NEXT: por %xmm0, %xmm3 5509; SSE-NEXT: movdqa (%rcx), %xmm0 5510; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5511; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5512; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5513; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] 5514; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[3,3,3,3,4,5,6,7] 5515; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5516; SSE-NEXT: movdqa %xmm14, %xmm5 5517; SSE-NEXT: pandn %xmm0, %xmm5 5518; SSE-NEXT: movdqa (%rdx), %xmm0 5519; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5520; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 5521; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5522; SSE-NEXT: pand %xmm14, %xmm0 5523; SSE-NEXT: movdqa %xmm14, %xmm9 5524; SSE-NEXT: por %xmm0, %xmm5 5525; SSE-NEXT: movdqa %xmm12, %xmm0 5526; SSE-NEXT: pandn %xmm5, %xmm0 5527; SSE-NEXT: pand %xmm12, %xmm3 5528; SSE-NEXT: por %xmm3, %xmm0 5529; SSE-NEXT: movdqa (%r9), %xmm15 5530; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] 5531; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5532; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5533; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] 5534; SSE-NEXT: movdqa %xmm7, %xmm6 5535; SSE-NEXT: pandn %xmm3, %xmm6 5536; SSE-NEXT: movdqa (%r8), %xmm8 5537; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] 5538; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5539; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5540; SSE-NEXT: pand %xmm7, %xmm3 5541; SSE-NEXT: por %xmm3, %xmm6 5542; SSE-NEXT: movdqa (%rax), %xmm4 5543; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] 5544; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5545; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5546; SSE-NEXT: movdqa %xmm2, %xmm14 5547; SSE-NEXT: pandn %xmm3, %xmm14 5548; SSE-NEXT: pand %xmm2, %xmm6 5549; SSE-NEXT: por %xmm6, %xmm14 5550; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 5551; SSE-NEXT: movdqa %xmm10, %xmm1 5552; SSE-NEXT: pandn %xmm14, %xmm1 5553; SSE-NEXT: pand %xmm10, %xmm0 5554; SSE-NEXT: por %xmm0, %xmm1 5555; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5556; SSE-NEXT: movdqa 16(%rsi), %xmm0 5557; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5558; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5559; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5560; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] 5561; SSE-NEXT: movdqa %xmm11, %xmm3 5562; SSE-NEXT: pandn %xmm0, %xmm3 5563; SSE-NEXT: movdqa 16(%rdi), %xmm0 5564; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5565; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 5566; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5567; SSE-NEXT: pand %xmm11, %xmm0 5568; SSE-NEXT: por %xmm0, %xmm3 5569; SSE-NEXT: movdqa 16(%rcx), %xmm0 5570; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5571; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5572; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5573; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] 5574; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 5575; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5576; SSE-NEXT: movdqa %xmm9, %xmm6 5577; SSE-NEXT: pandn %xmm0, %xmm6 5578; SSE-NEXT: movdqa 16(%rdx), %xmm0 5579; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5580; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 5581; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5582; SSE-NEXT: pand %xmm9, %xmm0 5583; SSE-NEXT: por %xmm0, %xmm6 5584; SSE-NEXT: movdqa %xmm12, %xmm0 5585; SSE-NEXT: pandn %xmm6, %xmm0 5586; SSE-NEXT: pand %xmm12, %xmm3 5587; 
SSE-NEXT: por %xmm3, %xmm0 5588; SSE-NEXT: movdqa 16(%r9), %xmm1 5589; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5590; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] 5591; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5592; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] 5593; SSE-NEXT: movdqa %xmm7, %xmm6 5594; SSE-NEXT: pandn %xmm3, %xmm6 5595; SSE-NEXT: movdqa 16(%r8), %xmm1 5596; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5597; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 5598; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5599; SSE-NEXT: pand %xmm7, %xmm3 5600; SSE-NEXT: por %xmm3, %xmm6 5601; SSE-NEXT: movdqa 16(%rax), %xmm1 5602; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5603; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 5604; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5605; SSE-NEXT: movdqa %xmm2, %xmm14 5606; SSE-NEXT: pandn %xmm3, %xmm14 5607; SSE-NEXT: pand %xmm2, %xmm6 5608; SSE-NEXT: por %xmm6, %xmm14 5609; SSE-NEXT: movdqa %xmm10, %xmm1 5610; SSE-NEXT: pandn %xmm14, %xmm1 5611; SSE-NEXT: pand %xmm10, %xmm0 5612; SSE-NEXT: por %xmm0, %xmm1 5613; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5614; SSE-NEXT: movdqa 32(%rsi), %xmm0 5615; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5616; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5617; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5618; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,0,3] 5619; SSE-NEXT: movdqa %xmm11, %xmm0 5620; SSE-NEXT: pandn %xmm3, %xmm0 5621; SSE-NEXT: movdqa 32(%rdi), %xmm1 5622; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 5623; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 5624; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5625; SSE-NEXT: pand %xmm11, %xmm3 5626; SSE-NEXT: por %xmm3, %xmm0 5627; SSE-NEXT: movdqa 32(%rcx), %xmm1 5628; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5629; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] 5630; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5631; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] 5632; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] 5633; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5634; SSE-NEXT: movdqa %xmm9, %xmm5 5635; SSE-NEXT: movdqa %xmm9, %xmm6 5636; SSE-NEXT: pandn %xmm3, %xmm6 5637; SSE-NEXT: movdqa 32(%rdx), %xmm9 5638; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,3,3,3,4,5,6,7] 5639; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5640; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 5641; SSE-NEXT: pand %xmm5, %xmm3 5642; SSE-NEXT: por %xmm3, %xmm6 5643; SSE-NEXT: pand %xmm12, %xmm0 5644; SSE-NEXT: pandn %xmm6, %xmm12 5645; SSE-NEXT: por %xmm0, %xmm12 5646; SSE-NEXT: movdqa 32(%r9), %xmm0 5647; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5648; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5649; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5650; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] 5651; SSE-NEXT: movdqa %xmm7, %xmm3 5652; SSE-NEXT: pandn %xmm0, %xmm3 5653; SSE-NEXT: movdqa 32(%r8), %xmm11 5654; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] 5655; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5656; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5657; SSE-NEXT: pand %xmm7, %xmm0 5658; SSE-NEXT: por %xmm0, %xmm3 5659; SSE-NEXT: pand 
%xmm2, %xmm3 5660; SSE-NEXT: movdqa 32(%rax), %xmm13 5661; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] 5662; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5663; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 5664; SSE-NEXT: pandn %xmm0, %xmm2 5665; SSE-NEXT: por %xmm3, %xmm2 5666; SSE-NEXT: pand %xmm10, %xmm12 5667; SSE-NEXT: pandn %xmm2, %xmm10 5668; SSE-NEXT: por %xmm12, %xmm10 5669; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5670; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5671; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5672; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5673; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] 5674; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5675; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 5676; SSE-NEXT: movdqa %xmm12, %xmm1 5677; SSE-NEXT: pandn %xmm0, %xmm1 5678; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5679; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,6,6,6] 5680; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 5681; SSE-NEXT: pand %xmm12, %xmm0 5682; SSE-NEXT: por %xmm0, %xmm1 5683; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 5684; SSE-NEXT: movdqa %xmm14, %xmm0 5685; SSE-NEXT: pandn %xmm1, %xmm0 5686; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5687; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] 5688; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 5689; SSE-NEXT: movdqa %xmm7, %xmm2 5690; SSE-NEXT: pandn %xmm1, %xmm2 5691; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5692; SSE-NEXT: movdqa %xmm5, %xmm1 5693; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 5694; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5695; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 5696; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 5697; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5698; SSE-NEXT: pand %xmm7, %xmm1 5699; SSE-NEXT: por %xmm2, %xmm1 5700; SSE-NEXT: pand %xmm14, %xmm1 5701; SSE-NEXT: por %xmm0, %xmm1 5702; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5703; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5704; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] 5705; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 5706; SSE-NEXT: movdqa %xmm0, %xmm3 5707; SSE-NEXT: pandn %xmm2, %xmm3 5708; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7] 5709; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 5710; SSE-NEXT: pand %xmm0, %xmm2 5711; SSE-NEXT: por %xmm3, %xmm2 5712; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,6,6] 5713; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5714; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 5715; SSE-NEXT: movdqa %xmm4, %xmm15 5716; SSE-NEXT: pandn %xmm3, %xmm15 5717; SSE-NEXT: pand %xmm4, %xmm2 5718; SSE-NEXT: por %xmm2, %xmm15 5719; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 5720; SSE-NEXT: movdqa %xmm10, %xmm0 5721; SSE-NEXT: pandn %xmm15, %xmm0 5722; SSE-NEXT: pand %xmm10, %xmm1 5723; SSE-NEXT: por %xmm1, %xmm0 5724; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5725; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5726; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5727; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5728; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] 5729; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5730; SSE-NEXT: movdqa %xmm12, %xmm8 5731; SSE-NEXT: movdqa %xmm12, %xmm2 5732; SSE-NEXT: pandn %xmm1, %xmm2 5733; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5734; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] 5735; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 5736; SSE-NEXT: pand %xmm12, %xmm1 5737; SSE-NEXT: por %xmm1, %xmm2 5738; SSE-NEXT: movdqa %xmm14, %xmm12 5739; SSE-NEXT: movdqa %xmm14, %xmm3 5740; SSE-NEXT: pandn %xmm2, %xmm3 5741; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5742; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] 5743; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 5744; SSE-NEXT: movdqa %xmm7, %xmm2 5745; SSE-NEXT: pandn %xmm1, %xmm2 5746; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5747; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5748; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5749; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] 5750; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 5751; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5752; SSE-NEXT: pand %xmm7, %xmm1 5753; SSE-NEXT: por %xmm2, %xmm1 5754; SSE-NEXT: pand %xmm14, %xmm1 5755; SSE-NEXT: por %xmm3, %xmm1 5756; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5757; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5758; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5759; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 5760; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 5761; SSE-NEXT: movdqa %xmm14, %xmm3 5762; SSE-NEXT: pandn %xmm2, %xmm3 5763; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5764; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] 5765; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 5766; SSE-NEXT: pand %xmm14, %xmm2 5767; SSE-NEXT: por %xmm3, %xmm2 5768; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5769; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,6,6] 5770; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5771; SSE-NEXT: movdqa %xmm4, %xmm15 5772; SSE-NEXT: pandn %xmm3, %xmm15 5773; SSE-NEXT: pand %xmm4, %xmm2 5774; SSE-NEXT: por %xmm2, %xmm15 5775; SSE-NEXT: movdqa %xmm10, %xmm0 5776; SSE-NEXT: pandn %xmm15, %xmm0 5777; SSE-NEXT: pand %xmm10, %xmm1 5778; SSE-NEXT: por %xmm1, %xmm0 5779; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5780; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5781; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5782; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5783; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] 5784; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5785; SSE-NEXT: movdqa %xmm8, %xmm2 5786; SSE-NEXT: pandn %xmm1, %xmm2 5787; SSE-NEXT: pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload 5788; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] 5789; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 5790; SSE-NEXT: pand %xmm8, %xmm1 
5791; SSE-NEXT: por %xmm1, %xmm2 5792; SSE-NEXT: movdqa %xmm12, %xmm3 5793; SSE-NEXT: pandn %xmm2, %xmm3 5794; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6] 5795; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 5796; SSE-NEXT: movdqa %xmm7, %xmm2 5797; SSE-NEXT: pandn %xmm1, %xmm2 5798; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5799; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5800; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5801; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] 5802; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] 5803; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5804; SSE-NEXT: pand %xmm7, %xmm1 5805; SSE-NEXT: por %xmm2, %xmm1 5806; SSE-NEXT: pand %xmm12, %xmm1 5807; SSE-NEXT: por %xmm3, %xmm1 5808; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5809; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5810; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5811; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 5812; SSE-NEXT: movdqa %xmm14, %xmm3 5813; SSE-NEXT: pandn %xmm2, %xmm3 5814; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,6,6,7] 5815; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 5816; SSE-NEXT: pand %xmm14, %xmm2 5817; SSE-NEXT: por %xmm3, %xmm2 5818; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,6,6] 5819; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] 5820; SSE-NEXT: movdqa %xmm4, %xmm15 5821; SSE-NEXT: pandn %xmm3, %xmm15 5822; SSE-NEXT: pand %xmm4, %xmm2 5823; SSE-NEXT: por %xmm2, %xmm15 5824; SSE-NEXT: pand %xmm10, %xmm1 5825; SSE-NEXT: pandn %xmm15, %xmm10 5826; SSE-NEXT: por %xmm1, %xmm10 5827; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5828; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5829; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,1,4,5,6,7] 5830; SSE-NEXT: movdqa %xmm5, %xmm14 5831; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 5832; SSE-NEXT: movdqa %xmm4, %xmm2 5833; SSE-NEXT: pandn %xmm1, %xmm2 5834; SSE-NEXT: movdqa %xmm6, %xmm8 5835; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] 5836; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 5837; SSE-NEXT: pand %xmm4, %xmm1 5838; SSE-NEXT: por %xmm1, %xmm2 5839; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 5840; SSE-NEXT: movdqa %xmm0, %xmm15 5841; SSE-NEXT: pandn %xmm2, %xmm15 5842; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5843; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5844; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,1,3,4,5,6,7] 5845; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,0] 5846; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 5847; SSE-NEXT: movdqa %xmm9, %xmm1 5848; SSE-NEXT: pandn %xmm2, %xmm1 5849; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5850; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] 5851; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 5852; SSE-NEXT: pand %xmm9, %xmm2 5853; SSE-NEXT: movdqa %xmm9, %xmm6 5854; SSE-NEXT: por %xmm2, %xmm1 5855; SSE-NEXT: pand %xmm0, %xmm1 5856; SSE-NEXT: por %xmm15, %xmm1 5857; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5858; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5859; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] 5860; SSE-NEXT: pshufd {{.*#+}} xmm2 
= xmm2[0,0,2,1] 5861; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 5862; SSE-NEXT: movdqa %xmm0, %xmm15 5863; SSE-NEXT: pandn %xmm2, %xmm15 5864; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5865; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] 5866; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 5867; SSE-NEXT: pand %xmm0, %xmm2 5868; SSE-NEXT: por %xmm2, %xmm15 5869; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5870; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7] 5871; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 5872; SSE-NEXT: movdqa %xmm7, %xmm10 5873; SSE-NEXT: pandn %xmm2, %xmm10 5874; SSE-NEXT: pand %xmm7, %xmm15 5875; SSE-NEXT: por %xmm15, %xmm10 5876; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 5877; SSE-NEXT: movdqa %xmm2, %xmm0 5878; SSE-NEXT: pandn %xmm10, %xmm0 5879; SSE-NEXT: pand %xmm2, %xmm1 5880; SSE-NEXT: por %xmm1, %xmm0 5881; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5882; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,2,2] 5883; SSE-NEXT: movdqa %xmm7, %xmm2 5884; SSE-NEXT: pandn %xmm1, %xmm2 5885; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,1,2,3,4,5,6,7] 5886; SSE-NEXT: movdqa %xmm3, %xmm9 5887; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,1] 5888; SSE-NEXT: pand %xmm7, %xmm10 5889; SSE-NEXT: por %xmm2, %xmm10 5890; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 5891; SSE-NEXT: movdqa %xmm0, %xmm15 5892; SSE-NEXT: movdqa %xmm0, %xmm12 5893; SSE-NEXT: pandn %xmm10, %xmm15 5894; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1] 5895; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4] 5896; SSE-NEXT: movdqa %xmm6, %xmm0 5897; SSE-NEXT: movdqa %xmm6, %xmm2 5898; SSE-NEXT: pandn %xmm10, %xmm2 5899; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7] 5900; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 5901; SSE-NEXT: pand %xmm0, %xmm10 5902; SSE-NEXT: por %xmm10, %xmm2 5903; SSE-NEXT: pand %xmm12, %xmm2 5904; SSE-NEXT: por %xmm15, %xmm2 5905; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] 5906; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] 5907; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 5908; SSE-NEXT: movdqa %xmm0, %xmm15 5909; SSE-NEXT: pandn %xmm10, %xmm15 5910; SSE-NEXT: movdqa %xmm13, %xmm3 5911; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] 5912; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] 5913; SSE-NEXT: pand %xmm0, %xmm10 5914; SSE-NEXT: por %xmm10, %xmm15 5915; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] 5916; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] 5917; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 5918; SSE-NEXT: movdqa %xmm13, %xmm0 5919; SSE-NEXT: pandn %xmm10, %xmm0 5920; SSE-NEXT: pand %xmm13, %xmm15 5921; SSE-NEXT: por %xmm15, %xmm0 5922; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 5923; SSE-NEXT: movdqa %xmm10, %xmm1 5924; SSE-NEXT: pandn %xmm0, %xmm1 5925; SSE-NEXT: pand %xmm10, %xmm2 5926; SSE-NEXT: por %xmm2, %xmm1 5927; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5928; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] 5929; SSE-NEXT: movdqa %xmm7, %xmm2 5930; SSE-NEXT: pandn %xmm0, %xmm2 5931; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] 5932; SSE-NEXT: movdqa %xmm8, %xmm10 5933; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,0,2,1] 5934; SSE-NEXT: pand %xmm7, %xmm0 5935; SSE-NEXT: por %xmm2, %xmm0 5936; SSE-NEXT: movdqa %xmm12, %xmm2 5937; SSE-NEXT: pandn %xmm0, %xmm2 5938; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] 5939; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 5940; SSE-NEXT: movdqa %xmm13, %xmm15 5941; SSE-NEXT: movdqa %xmm13, %xmm8 5942; SSE-NEXT: pandn %xmm0, %xmm8 5943; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] 5944; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 5945; SSE-NEXT: pand %xmm13, %xmm0 5946; SSE-NEXT: por %xmm0, %xmm8 5947; SSE-NEXT: pand %xmm12, %xmm8 5948; SSE-NEXT: por %xmm2, %xmm8 5949; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 5950; SSE-NEXT: movdqa %xmm1, %xmm0 5951; SSE-NEXT: movdqa %xmm1, %xmm6 5952; SSE-NEXT: pandn %xmm8, %xmm0 5953; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7] 5954; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 5955; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 5956; SSE-NEXT: movdqa %xmm1, %xmm5 5957; SSE-NEXT: pandn %xmm2, %xmm5 5958; SSE-NEXT: movdqa %xmm3, %xmm4 5959; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,3,4,5,6,7] 5960; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 5961; SSE-NEXT: pand %xmm1, %xmm2 5962; SSE-NEXT: por %xmm2, %xmm5 5963; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] 5964; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 5965; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 5966; SSE-NEXT: movdqa %xmm1, %xmm8 5967; SSE-NEXT: pandn %xmm2, %xmm8 5968; SSE-NEXT: pand %xmm1, %xmm5 5969; SSE-NEXT: por %xmm5, %xmm8 5970; SSE-NEXT: pand %xmm6, %xmm8 5971; SSE-NEXT: por %xmm0, %xmm8 5972; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5973; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5974; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] 5975; SSE-NEXT: movdqa %xmm1, %xmm2 5976; SSE-NEXT: pandn %xmm0, %xmm2 5977; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] 5978; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 5979; SSE-NEXT: pand %xmm1, %xmm0 5980; SSE-NEXT: por %xmm2, %xmm0 5981; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 5982; SSE-NEXT: movdqa %xmm1, %xmm5 5983; SSE-NEXT: pandn %xmm0, %xmm5 5984; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5] 5985; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 5986; SSE-NEXT: movdqa %xmm7, %xmm8 5987; SSE-NEXT: pandn %xmm0, %xmm8 5988; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5989; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,2,2,3,4,5,6,7] 5990; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] 5991; SSE-NEXT: pand %xmm7, %xmm2 5992; SSE-NEXT: por %xmm8, %xmm2 5993; SSE-NEXT: pand %xmm1, %xmm2 5994; SSE-NEXT: por %xmm5, %xmm2 5995; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5996; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7] 5997; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 5998; SSE-NEXT: movdqa %xmm15, %xmm5 5999; SSE-NEXT: pandn %xmm0, %xmm5 6000; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] 6001; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 6002; SSE-NEXT: pand %xmm15, %xmm0 6003; SSE-NEXT: por %xmm0, %xmm5 6004; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] 6005; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 6006; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 
6007; SSE-NEXT: movdqa %xmm12, %xmm8 6008; SSE-NEXT: pandn %xmm0, %xmm8 6009; SSE-NEXT: pand %xmm12, %xmm5 6010; SSE-NEXT: por %xmm5, %xmm8 6011; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 6012; SSE-NEXT: movdqa %xmm3, %xmm0 6013; SSE-NEXT: pandn %xmm8, %xmm0 6014; SSE-NEXT: pand %xmm3, %xmm2 6015; SSE-NEXT: por %xmm2, %xmm0 6016; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6017; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7] 6018; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 6019; SSE-NEXT: movdqa %xmm12, %xmm2 6020; SSE-NEXT: pandn %xmm0, %xmm2 6021; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] 6022; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6023; SSE-NEXT: pand %xmm12, %xmm0 6024; SSE-NEXT: por %xmm0, %xmm2 6025; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 6026; SSE-NEXT: movdqa %xmm5, %xmm0 6027; SSE-NEXT: pandn %xmm2, %xmm0 6028; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,6,6,7] 6029; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,3,3] 6030; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 6031; SSE-NEXT: movdqa %xmm3, %xmm2 6032; SSE-NEXT: pandn %xmm6, %xmm2 6033; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7] 6034; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 6035; SSE-NEXT: pand %xmm3, %xmm6 6036; SSE-NEXT: movdqa %xmm3, %xmm10 6037; SSE-NEXT: por %xmm6, %xmm2 6038; SSE-NEXT: pand %xmm5, %xmm2 6039; SSE-NEXT: por %xmm0, %xmm2 6040; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] 6041; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6042; SSE-NEXT: movdqa %xmm7, %xmm6 6043; SSE-NEXT: pandn %xmm0, %xmm6 6044; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] 6045; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] 6046; SSE-NEXT: pand %xmm7, %xmm0 6047; SSE-NEXT: por %xmm6, %xmm0 6048; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,7,7,7] 6049; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] 6050; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 6051; SSE-NEXT: movdqa %xmm1, %xmm8 6052; SSE-NEXT: pandn %xmm6, %xmm8 6053; SSE-NEXT: pand %xmm1, %xmm0 6054; SSE-NEXT: movdqa %xmm1, %xmm4 6055; SSE-NEXT: por %xmm0, %xmm8 6056; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 6057; SSE-NEXT: movdqa %xmm1, %xmm0 6058; SSE-NEXT: pandn %xmm8, %xmm0 6059; SSE-NEXT: pand %xmm1, %xmm2 6060; SSE-NEXT: por %xmm2, %xmm0 6061; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6062; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6063; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6064; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] 6065; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 6066; SSE-NEXT: movdqa %xmm3, %xmm2 6067; SSE-NEXT: pandn %xmm0, %xmm2 6068; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6069; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] 6070; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6071; SSE-NEXT: pand %xmm3, %xmm0 6072; SSE-NEXT: por %xmm0, %xmm2 6073; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 6074; SSE-NEXT: movdqa %xmm3, %xmm6 6075; SSE-NEXT: pandn %xmm2, %xmm6 6076; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6077; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6078; SSE-NEXT: pshuflw 
{{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] 6079; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,0] 6080; SSE-NEXT: movdqa %xmm4, %xmm0 6081; SSE-NEXT: pandn %xmm2, %xmm0 6082; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6083; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] 6084; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 6085; SSE-NEXT: pand %xmm4, %xmm2 6086; SSE-NEXT: por %xmm2, %xmm0 6087; SSE-NEXT: pand %xmm3, %xmm0 6088; SSE-NEXT: por %xmm6, %xmm0 6089; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6090; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6091; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] 6092; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 6093; SSE-NEXT: movdqa %xmm12, %xmm6 6094; SSE-NEXT: pandn %xmm2, %xmm6 6095; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6096; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] 6097; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 6098; SSE-NEXT: pand %xmm12, %xmm2 6099; SSE-NEXT: por %xmm2, %xmm6 6100; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6101; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] 6102; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 6103; SSE-NEXT: movdqa %xmm7, %xmm8 6104; SSE-NEXT: pandn %xmm2, %xmm8 6105; SSE-NEXT: pand %xmm7, %xmm6 6106; SSE-NEXT: por %xmm6, %xmm8 6107; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 6108; SSE-NEXT: movdqa %xmm2, %xmm6 6109; SSE-NEXT: pandn %xmm8, %xmm6 6110; SSE-NEXT: pand %xmm2, %xmm0 6111; SSE-NEXT: por %xmm0, %xmm6 6112; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6113; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] 6114; SSE-NEXT: movdqa %xmm7, %xmm2 6115; SSE-NEXT: pandn %xmm0, %xmm2 6116; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7] 6117; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6118; SSE-NEXT: pand %xmm7, %xmm0 6119; SSE-NEXT: por %xmm2, %xmm0 6120; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 6121; SSE-NEXT: movdqa %xmm12, %xmm2 6122; SSE-NEXT: pandn %xmm0, %xmm2 6123; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] 6124; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,7,5,6,4] 6125; SSE-NEXT: movdqa %xmm4, %xmm0 6126; SSE-NEXT: pandn %xmm6, %xmm0 6127; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[1,1,2,2,4,5,6,7] 6128; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] 6129; SSE-NEXT: pand %xmm4, %xmm6 6130; SSE-NEXT: por %xmm6, %xmm0 6131; SSE-NEXT: pand %xmm12, %xmm0 6132; SSE-NEXT: por %xmm2, %xmm0 6133; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] 6134; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 6135; SSE-NEXT: movdqa %xmm10, %xmm6 6136; SSE-NEXT: pandn %xmm2, %xmm6 6137; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] 6138; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 6139; SSE-NEXT: pand %xmm10, %xmm2 6140; SSE-NEXT: por %xmm2, %xmm6 6141; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7] 6142; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 6143; SSE-NEXT: movdqa %xmm15, %xmm8 6144; SSE-NEXT: pandn %xmm2, %xmm8 6145; SSE-NEXT: pand %xmm15, %xmm6 6146; SSE-NEXT: movdqa %xmm15, %xmm10 6147; SSE-NEXT: por %xmm6, %xmm8 6148; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 6149; SSE-NEXT: movdqa %xmm6, %xmm2 6150; SSE-NEXT: pandn %xmm8, %xmm2 6151; SSE-NEXT: pand %xmm6, %xmm0 6152; SSE-NEXT: por %xmm0, %xmm2 6153; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6154; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] 6155; SSE-NEXT: movdqa %xmm7, %xmm2 6156; SSE-NEXT: pandn %xmm0, %xmm2 6157; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] 6158; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6159; SSE-NEXT: pand %xmm7, %xmm0 6160; SSE-NEXT: por %xmm2, %xmm0 6161; SSE-NEXT: movdqa %xmm12, %xmm2 6162; SSE-NEXT: pandn %xmm0, %xmm2 6163; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] 6164; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6165; SSE-NEXT: movdqa %xmm15, %xmm6 6166; SSE-NEXT: pandn %xmm0, %xmm6 6167; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] 6168; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 6169; SSE-NEXT: pand %xmm15, %xmm0 6170; SSE-NEXT: por %xmm0, %xmm6 6171; SSE-NEXT: pand %xmm12, %xmm6 6172; SSE-NEXT: por %xmm2, %xmm6 6173; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 6174; SSE-NEXT: movdqa %xmm9, %xmm0 6175; SSE-NEXT: pandn %xmm6, %xmm0 6176; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,5,7] 6177; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] 6178; SSE-NEXT: movdqa %xmm4, %xmm12 6179; SSE-NEXT: movdqa %xmm4, %xmm6 6180; SSE-NEXT: pandn %xmm2, %xmm6 6181; SSE-NEXT: movdqa %xmm13, %xmm5 6182; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,2,2,3,4,5,6,7] 6183; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] 6184; SSE-NEXT: pand %xmm4, %xmm2 6185; SSE-NEXT: por %xmm2, %xmm6 6186; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] 6187; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0] 6188; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 6189; SSE-NEXT: movdqa %xmm2, %xmm4 6190; SSE-NEXT: pandn %xmm8, %xmm4 6191; SSE-NEXT: pand %xmm2, %xmm6 6192; SSE-NEXT: por %xmm6, %xmm4 6193; SSE-NEXT: pand %xmm9, %xmm4 6194; SSE-NEXT: por %xmm0, %xmm4 6195; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6196; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6197; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,1,3] 6198; SSE-NEXT: movdqa %xmm2, %xmm6 6199; SSE-NEXT: pandn %xmm0, %xmm6 6200; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] 6201; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 6202; SSE-NEXT: pand %xmm2, %xmm0 6203; SSE-NEXT: por %xmm6, %xmm0 6204; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 6205; SSE-NEXT: movdqa %xmm2, %xmm6 6206; SSE-NEXT: pandn %xmm0, %xmm6 6207; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5] 6208; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6209; SSE-NEXT: movdqa %xmm7, %xmm8 6210; SSE-NEXT: pandn %xmm0, %xmm8 6211; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6212; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] 6213; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6214; SSE-NEXT: pand %xmm7, %xmm0 6215; SSE-NEXT: por %xmm8, %xmm0 6216; SSE-NEXT: pand %xmm2, %xmm0 6217; SSE-NEXT: por %xmm6, %xmm0 6218; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6219; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[1,2,2,3,4,5,6,7] 6220; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] 6221; SSE-NEXT: movdqa %xmm10, %xmm8 6222; SSE-NEXT: pandn %xmm6, %xmm8 6223; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] 6224; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] 6225; SSE-NEXT: pand %xmm10, %xmm6 6226; SSE-NEXT: por %xmm6, %xmm8 6227; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] 6228; SSE-NEXT: movdqa %xmm14, %xmm2 
6229; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 6230; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 6231; SSE-NEXT: movdqa %xmm3, %xmm10 6232; SSE-NEXT: pandn %xmm6, %xmm10 6233; SSE-NEXT: pand %xmm3, %xmm8 6234; SSE-NEXT: movdqa %xmm3, %xmm6 6235; SSE-NEXT: por %xmm8, %xmm10 6236; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 6237; SSE-NEXT: movdqa %xmm3, %xmm4 6238; SSE-NEXT: pandn %xmm10, %xmm4 6239; SSE-NEXT: pand %xmm3, %xmm0 6240; SSE-NEXT: por %xmm0, %xmm4 6241; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6242; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7] 6243; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 6244; SSE-NEXT: movdqa %xmm6, %xmm3 6245; SSE-NEXT: pandn %xmm0, %xmm6 6246; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] 6247; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6248; SSE-NEXT: pand %xmm3, %xmm0 6249; SSE-NEXT: movdqa %xmm3, %xmm14 6250; SSE-NEXT: por %xmm0, %xmm6 6251; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 6252; SSE-NEXT: movdqa %xmm3, %xmm8 6253; SSE-NEXT: pandn %xmm6, %xmm8 6254; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,6,7] 6255; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] 6256; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 6257; SSE-NEXT: movdqa %xmm4, %xmm0 6258; SSE-NEXT: pandn %xmm6, %xmm0 6259; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7] 6260; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 6261; SSE-NEXT: pand %xmm4, %xmm6 6262; SSE-NEXT: movdqa %xmm4, %xmm11 6263; SSE-NEXT: por %xmm6, %xmm0 6264; SSE-NEXT: pand %xmm3, %xmm0 6265; SSE-NEXT: por %xmm8, %xmm0 6266; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7] 6267; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 6268; SSE-NEXT: movdqa %xmm7, %xmm8 6269; SSE-NEXT: pandn %xmm6, %xmm8 6270; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7] 6271; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] 6272; SSE-NEXT: pand %xmm7, %xmm6 6273; SSE-NEXT: por %xmm8, %xmm6 6274; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,7,7,7] 6275; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,2] 6276; SSE-NEXT: movdqa %xmm12, %xmm10 6277; SSE-NEXT: pandn %xmm8, %xmm10 6278; SSE-NEXT: pand %xmm12, %xmm6 6279; SSE-NEXT: movdqa %xmm12, %xmm4 6280; SSE-NEXT: por %xmm6, %xmm10 6281; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 6282; SSE-NEXT: movdqa %xmm1, %xmm2 6283; SSE-NEXT: pandn %xmm10, %xmm2 6284; SSE-NEXT: pand %xmm1, %xmm0 6285; SSE-NEXT: por %xmm0, %xmm2 6286; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6287; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6288; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6289; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] 6290; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 6291; SSE-NEXT: movdqa %xmm11, %xmm1 6292; SSE-NEXT: movdqa %xmm11, %xmm8 6293; SSE-NEXT: pandn %xmm0, %xmm8 6294; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6295; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] 6296; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6297; SSE-NEXT: pand %xmm1, %xmm0 6298; SSE-NEXT: por %xmm0, %xmm8 6299; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 6300; SSE-NEXT: movdqa %xmm3, %xmm10 6301; SSE-NEXT: pandn %xmm8, %xmm10 6302; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6303; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6304; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] 6305; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,1,0] 6306; SSE-NEXT: movdqa %xmm12, %xmm0 6307; SSE-NEXT: pandn %xmm8, %xmm0 6308; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload 6309; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] 6310; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] 6311; SSE-NEXT: pand %xmm12, %xmm8 6312; SSE-NEXT: por %xmm8, %xmm0 6313; SSE-NEXT: pand %xmm3, %xmm0 6314; SSE-NEXT: por %xmm10, %xmm0 6315; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6316; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6317; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,1,1,3,4,5,6,7] 6318; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] 6319; SSE-NEXT: movdqa %xmm14, %xmm10 6320; SSE-NEXT: pandn %xmm8, %xmm10 6321; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6322; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7] 6323; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] 6324; SSE-NEXT: pand %xmm14, %xmm8 6325; SSE-NEXT: por %xmm8, %xmm10 6326; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6327; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] 6328; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] 6329; SSE-NEXT: movdqa %xmm7, %xmm15 6330; SSE-NEXT: pandn %xmm8, %xmm15 6331; SSE-NEXT: pand %xmm7, %xmm10 6332; SSE-NEXT: por %xmm10, %xmm15 6333; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 6334; SSE-NEXT: movdqa %xmm8, %xmm10 6335; SSE-NEXT: pandn %xmm15, %xmm10 6336; SSE-NEXT: pand %xmm8, %xmm0 6337; SSE-NEXT: por %xmm0, %xmm10 6338; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6339; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] 6340; SSE-NEXT: movdqa %xmm7, %xmm8 6341; SSE-NEXT: pandn %xmm0, %xmm8 6342; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] 6343; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6344; SSE-NEXT: pand %xmm7, %xmm0 6345; SSE-NEXT: por %xmm8, %xmm0 6346; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 6347; SSE-NEXT: movdqa %xmm12, %xmm8 6348; SSE-NEXT: pandn %xmm0, %xmm8 6349; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] 6350; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4] 6351; SSE-NEXT: movdqa %xmm4, %xmm14 6352; SSE-NEXT: movdqa %xmm4, %xmm0 6353; SSE-NEXT: pandn %xmm10, %xmm0 6354; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7] 6355; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 6356; SSE-NEXT: pand %xmm4, %xmm10 6357; SSE-NEXT: por %xmm10, %xmm0 6358; SSE-NEXT: pand %xmm12, %xmm0 6359; SSE-NEXT: movdqa %xmm12, %xmm4 6360; SSE-NEXT: por %xmm8, %xmm0 6361; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] 6362; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] 6363; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 6364; SSE-NEXT: movdqa %xmm2, %xmm10 6365; SSE-NEXT: pandn %xmm8, %xmm10 6366; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7] 6367; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] 6368; SSE-NEXT: pand %xmm2, %xmm8 6369; SSE-NEXT: por %xmm8, %xmm10 6370; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7] 6371; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] 6372; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 6373; SSE-NEXT: movdqa %xmm2, %xmm15 6374; SSE-NEXT: pandn %xmm8, %xmm15 6375; SSE-NEXT: pand %xmm2, %xmm10 6376; SSE-NEXT: movdqa %xmm2, %xmm12 6377; SSE-NEXT: por %xmm10, %xmm15 6378; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 6379; SSE-NEXT: movdqa %xmm2, %xmm8 6380; SSE-NEXT: pandn %xmm15, %xmm8 6381; SSE-NEXT: pand %xmm2, %xmm0 6382; SSE-NEXT: por %xmm0, %xmm8 6383; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6384; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] 6385; SSE-NEXT: movdqa %xmm7, %xmm10 6386; SSE-NEXT: pandn %xmm0, %xmm10 6387; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] 6388; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6389; SSE-NEXT: pand %xmm7, %xmm0 6390; SSE-NEXT: por %xmm10, %xmm0 6391; SSE-NEXT: movdqa %xmm4, %xmm10 6392; SSE-NEXT: pandn %xmm0, %xmm10 6393; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] 6394; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6395; SSE-NEXT: movdqa %xmm12, %xmm15 6396; SSE-NEXT: pandn %xmm0, %xmm15 6397; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] 6398; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 6399; SSE-NEXT: pand %xmm12, %xmm0 6400; SSE-NEXT: por %xmm0, %xmm15 6401; SSE-NEXT: pand %xmm4, %xmm15 6402; SSE-NEXT: por %xmm10, %xmm15 6403; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 6404; SSE-NEXT: movdqa %xmm2, %xmm10 6405; SSE-NEXT: pandn %xmm15, %xmm10 6406; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,5,7] 6407; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 6408; SSE-NEXT: movdqa %xmm14, %xmm15 6409; SSE-NEXT: pandn %xmm0, %xmm15 6410; SSE-NEXT: movdqa %xmm1, %xmm8 6411; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,3,4,5,6,7] 6412; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6413; SSE-NEXT: pand %xmm14, %xmm0 6414; SSE-NEXT: movdqa %xmm14, %xmm4 6415; SSE-NEXT: por %xmm0, %xmm15 6416; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7] 6417; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6418; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 6419; SSE-NEXT: movdqa %xmm1, %xmm9 6420; SSE-NEXT: pandn %xmm0, %xmm9 6421; SSE-NEXT: pand %xmm1, %xmm15 6422; SSE-NEXT: por %xmm15, %xmm9 6423; SSE-NEXT: pand %xmm2, %xmm9 6424; SSE-NEXT: por %xmm10, %xmm9 6425; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6426; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] 6427; SSE-NEXT: movdqa %xmm1, %xmm10 6428; SSE-NEXT: pandn %xmm0, %xmm10 6429; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] 6430; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] 6431; SSE-NEXT: pand %xmm1, %xmm0 6432; SSE-NEXT: por %xmm10, %xmm0 6433; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5] 6434; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] 6435; SSE-NEXT: movdqa %xmm7, %xmm15 6436; SSE-NEXT: pandn %xmm10, %xmm15 6437; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6438; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,2,2,3,4,5,6,7] 6439; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 6440; SSE-NEXT: pand %xmm7, %xmm10 6441; SSE-NEXT: por %xmm15, %xmm10 6442; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 6443; SSE-NEXT: pand %xmm1, %xmm10 6444; SSE-NEXT: pandn %xmm0, %xmm1 6445; SSE-NEXT: por %xmm10, %xmm1 6446; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6447; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm5[1,2,2,3,4,5,6,7] 6448; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6449; SSE-NEXT: movdqa %xmm12, %xmm10 6450; SSE-NEXT: pandn %xmm0, %xmm10 6451; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] 6452; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 6453; SSE-NEXT: pand %xmm12, %xmm0 6454; SSE-NEXT: por %xmm0, %xmm10 6455; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] 6456; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 6457; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 6458; SSE-NEXT: movdqa %xmm14, %xmm15 6459; SSE-NEXT: pandn %xmm0, %xmm15 6460; SSE-NEXT: pand %xmm14, %xmm10 6461; SSE-NEXT: por %xmm10, %xmm15 6462; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 6463; SSE-NEXT: pand %xmm0, %xmm1 6464; SSE-NEXT: pandn %xmm15, %xmm0 6465; SSE-NEXT: por %xmm1, %xmm0 6466; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6467; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] 6468; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 6469; SSE-NEXT: movdqa %xmm14, %xmm10 6470; SSE-NEXT: pandn %xmm0, %xmm10 6471; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] 6472; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6473; SSE-NEXT: pand %xmm14, %xmm0 6474; SSE-NEXT: por %xmm0, %xmm10 6475; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] 6476; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] 6477; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] 6478; SSE-NEXT: movdqa %xmm1, %xmm15 6479; SSE-NEXT: pandn %xmm0, %xmm15 6480; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] 6481; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6482; SSE-NEXT: pand %xmm1, %xmm0 6483; SSE-NEXT: movdqa %xmm1, %xmm2 6484; SSE-NEXT: por %xmm0, %xmm15 6485; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 6486; SSE-NEXT: pand %xmm0, %xmm15 6487; SSE-NEXT: pandn %xmm10, %xmm0 6488; SSE-NEXT: por %xmm15, %xmm0 6489; SSE-NEXT: movdqa %xmm0, %xmm1 6490; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] 6491; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 6492; SSE-NEXT: movdqa %xmm7, %xmm10 6493; SSE-NEXT: pandn %xmm0, %xmm10 6494; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] 6495; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] 6496; SSE-NEXT: pand %xmm7, %xmm0 6497; SSE-NEXT: por %xmm10, %xmm0 6498; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,7,7,7] 6499; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,2] 6500; SSE-NEXT: movdqa %xmm4, %xmm14 6501; SSE-NEXT: movdqa %xmm4, %xmm15 6502; SSE-NEXT: pandn %xmm10, %xmm15 6503; SSE-NEXT: pand %xmm4, %xmm0 6504; SSE-NEXT: por %xmm0, %xmm15 6505; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 6506; SSE-NEXT: pand %xmm0, %xmm1 6507; SSE-NEXT: pandn %xmm15, %xmm0 6508; SSE-NEXT: por %xmm1, %xmm0 6509; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill 6510; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6511; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6512; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7] 6513; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 6514; SSE-NEXT: movdqa %xmm2, %xmm10 6515; SSE-NEXT: pandn %xmm0, %xmm10 6516; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6517; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] 6518; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6519; SSE-NEXT: pand 
%xmm2, %xmm0 6520; SSE-NEXT: movdqa %xmm2, %xmm6 6521; SSE-NEXT: por %xmm0, %xmm10 6522; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6523; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6524; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,1,3,4,5,6,7] 6525; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 6526; SSE-NEXT: movdqa %xmm4, %xmm15 6527; SSE-NEXT: pandn %xmm0, %xmm15 6528; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6529; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] 6530; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6531; SSE-NEXT: pand %xmm4, %xmm0 6532; SSE-NEXT: por %xmm0, %xmm15 6533; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 6534; SSE-NEXT: pand %xmm0, %xmm15 6535; SSE-NEXT: pandn %xmm10, %xmm0 6536; SSE-NEXT: por %xmm15, %xmm0 6537; SSE-NEXT: movdqa %xmm0, %xmm3 6538; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6539; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] 6540; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6541; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 6542; SSE-NEXT: pand %xmm15, %xmm0 6543; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6544; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 6545; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7] 6546; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] 6547; SSE-NEXT: pandn %xmm10, %xmm15 6548; SSE-NEXT: por %xmm0, %xmm15 6549; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6550; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] 6551; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6552; SSE-NEXT: movdqa %xmm7, %xmm10 6553; SSE-NEXT: pandn %xmm0, %xmm10 6554; SSE-NEXT: pand %xmm7, %xmm15 6555; SSE-NEXT: por %xmm15, %xmm10 6556; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 6557; SSE-NEXT: pand %xmm0, %xmm3 6558; SSE-NEXT: pandn %xmm10, %xmm0 6559; SSE-NEXT: por %xmm3, %xmm0 6560; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6561; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,2] 6562; SSE-NEXT: movdqa %xmm7, %xmm3 6563; SSE-NEXT: pandn %xmm0, %xmm3 6564; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] 6565; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6566; SSE-NEXT: pand %xmm7, %xmm0 6567; SSE-NEXT: por %xmm3, %xmm0 6568; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 6569; SSE-NEXT: movdqa %xmm4, %xmm10 6570; SSE-NEXT: pandn %xmm0, %xmm10 6571; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,2,1] 6572; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] 6573; SSE-NEXT: movdqa %xmm14, %xmm3 6574; SSE-NEXT: pandn %xmm0, %xmm3 6575; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,2,4,5,6,7] 6576; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6577; SSE-NEXT: pand %xmm14, %xmm0 6578; SSE-NEXT: por %xmm0, %xmm3 6579; SSE-NEXT: pand %xmm4, %xmm3 6580; SSE-NEXT: por %xmm10, %xmm3 6581; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] 6582; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6583; SSE-NEXT: movdqa %xmm6, %xmm4 6584; SSE-NEXT: pand %xmm6, %xmm0 6585; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] 6586; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] 6587; SSE-NEXT: pandn %xmm10, %xmm4 6588; SSE-NEXT: por %xmm0, %xmm4 6589; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] 6590; SSE-NEXT: movdqa 
%xmm1, %xmm15 6591; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6592; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] 6593; SSE-NEXT: movdqa %xmm6, %xmm10 6594; SSE-NEXT: pandn %xmm0, %xmm10 6595; SSE-NEXT: pand %xmm6, %xmm4 6596; SSE-NEXT: por %xmm4, %xmm10 6597; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 6598; SSE-NEXT: pand %xmm1, %xmm3 6599; SSE-NEXT: pandn %xmm10, %xmm1 6600; SSE-NEXT: por %xmm3, %xmm1 6601; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] 6602; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] 6603; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] 6604; SSE-NEXT: pand %xmm7, %xmm3 6605; SSE-NEXT: pandn %xmm0, %xmm7 6606; SSE-NEXT: por %xmm3, %xmm7 6607; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] 6608; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 6609; SSE-NEXT: pand %xmm6, %xmm0 6610; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,6,7] 6611; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] 6612; SSE-NEXT: pandn %xmm3, %xmm6 6613; SSE-NEXT: por %xmm0, %xmm6 6614; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 6615; SSE-NEXT: pand %xmm0, %xmm6 6616; SSE-NEXT: pandn %xmm7, %xmm0 6617; SSE-NEXT: por %xmm6, %xmm0 6618; SSE-NEXT: movdqa %xmm0, %xmm4 6619; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] 6620; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] 6621; SSE-NEXT: pand %xmm14, %xmm0 6622; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,5,7] 6623; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] 6624; SSE-NEXT: pandn %xmm3, %xmm14 6625; SSE-NEXT: por %xmm0, %xmm14 6626; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] 6627; SSE-NEXT: pand %xmm3, %xmm14 6628; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] 6629; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 6630; SSE-NEXT: pandn %xmm0, %xmm3 6631; SSE-NEXT: por %xmm14, %xmm3 6632; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] 6633; SSE-NEXT: pand %xmm0, %xmm3 6634; SSE-NEXT: pandn %xmm4, %xmm0 6635; SSE-NEXT: por %xmm0, %xmm3 6636; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 6637; SSE-NEXT: movdqa %xmm3, 368(%rax) 6638; SSE-NEXT: movdqa %xmm1, 352(%rax) 6639; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6640; SSE-NEXT: movaps %xmm0, 336(%rax) 6641; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 6642; SSE-NEXT: movaps %xmm0, 320(%rax) 6643; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6644; SSE-NEXT: movaps %xmm0, 288(%rax) 6645; SSE-NEXT: movdqa %xmm9, 256(%rax) 6646; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6647; SSE-NEXT: movaps %xmm0, 240(%rax) 6648; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6649; SSE-NEXT: movaps %xmm0, 224(%rax) 6650; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6651; SSE-NEXT: movaps %xmm0, 208(%rax) 6652; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6653; SSE-NEXT: movaps %xmm0, 176(%rax) 6654; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6655; SSE-NEXT: movaps %xmm0, 144(%rax) 6656; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6657; SSE-NEXT: movaps %xmm0, 128(%rax) 6658; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6659; SSE-NEXT: movaps %xmm0, 112(%rax) 6660; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6661; SSE-NEXT: movaps 
%xmm0, 96(%rax) 6662; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6663; SSE-NEXT: movaps %xmm0, 64(%rax) 6664; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6665; SSE-NEXT: movaps %xmm0, 32(%rax) 6666; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6667; SSE-NEXT: movaps %xmm0, 16(%rax) 6668; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6669; SSE-NEXT: movaps %xmm0, (%rax) 6670; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6671; SSE-NEXT: movaps %xmm0, 304(%rax) 6672; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6673; SSE-NEXT: movaps %xmm0, 192(%rax) 6674; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6675; SSE-NEXT: movaps %xmm0, 80(%rax) 6676; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6677; SSE-NEXT: movaps %xmm0, 272(%rax) 6678; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6679; SSE-NEXT: movaps %xmm0, 160(%rax) 6680; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6681; SSE-NEXT: movaps %xmm0, 48(%rax) 6682; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6683; SSE-NEXT: movaps %xmm0, 432(%rax) 6684; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6685; SSE-NEXT: movaps %xmm0, 400(%rax) 6686; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6687; SSE-NEXT: movaps %xmm0, 416(%rax) 6688; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6689; SSE-NEXT: movaps %xmm0, 384(%rax) 6690; SSE-NEXT: addq $648, %rsp # imm = 0x288 6691; SSE-NEXT: retq 6692; 6693; AVX-LABEL: store_i8_stride7_vf64: 6694; AVX: # %bb.0: 6695; AVX-NEXT: subq $616, %rsp # imm = 0x268 6696; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 6697; AVX-NEXT: vmovdqa 16(%rax), %xmm6 6698; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] 6699; AVX-NEXT: vmovdqa 16(%r8), %xmm10 6700; AVX-NEXT: vmovdqa 16(%r9), %xmm8 6701; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] 6702; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero 6703; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 6704; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero 6705; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] 6706; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6707; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6708; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] 6709; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 6710; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 6711; AVX-NEXT: vmovdqa 16(%rsi), %xmm7 6712; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 6713; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] 6714; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] 6715; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] 6716; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 6717; AVX-NEXT: vmovdqa 16(%rcx), %xmm12 6718; AVX-NEXT: vmovdqa 
16(%rdx), %xmm13 6719; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] 6720; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] 6721; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 6722; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6723; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6724; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] 6725; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 6726; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 6727; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3 6728; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4 6729; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 6730; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 6731; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 6732; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 6733; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 6734; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6735; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u] 6736; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6737; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] 6738; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 6739; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u] 6740; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1 6741; AVX-NEXT: vmovdqa %xmm3, %xmm8 6742; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u] 6743; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3 6744; AVX-NEXT: vmovdqa %xmm4, %xmm10 6745; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 6746; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u] 6747; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 6748; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u] 6749; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 6750; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 6751; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 6752; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] 6753; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm1 6754; AVX-NEXT: vmovdqa %xmm3, %xmm12 6755; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u] 6756; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm3 6757; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 6758; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] 6759; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 6760; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128] 6761; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6762; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm2 6763; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9] 6764; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6765; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm3 6766; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 6767; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] 6768; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = 
[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] 6769; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3 6770; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 6771; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 6772; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1 6773; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2 6774; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 6775; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 6776; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 6777; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 6778; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 6779; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6780; AVX-NEXT: vmovdqa 32(%r8), %xmm3 6781; AVX-NEXT: vmovdqa 32(%r9), %xmm11 6782; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm0 6783; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm1 6784; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 6785; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0 6786; AVX-NEXT: vmovdqa 32(%rax), %xmm8 6787; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm1 6788; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 6789; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero 6790; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] 6791; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6792; AVX-NEXT: vmovdqa %xmm3, %xmm10 6793; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6794; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6795; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] 6796; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 6797; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 6798; AVX-NEXT: vmovdqa 32(%rcx), %xmm0 6799; AVX-NEXT: vmovdqa 32(%rdx), %xmm2 6800; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm1 6801; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm3 6802; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 6803; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 6804; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6805; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] 6806; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5 6807; AVX-NEXT: vmovdqa 32(%rsi), %xmm1 6808; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 6809; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm6 6810; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm7 6811; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 6812; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 6813; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6814; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] 6815; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7 6816; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 6817; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 6818; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 6819; AVX-NEXT: vandps %ymm7, %ymm6, %ymm6 6820; AVX-NEXT: vorps %ymm5, %ymm6, %ymm5 6821; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 6822; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4 6823; 
AVX-NEXT: vandps %ymm6, %ymm5, %ymm5 6824; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 6825; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6826; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6827; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero 6828; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] 6829; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] 6830; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 6831; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] 6832; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] 6833; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 6834; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 6835; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 6836; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6837; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 6838; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1 6839; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] 6840; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 6841; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 6842; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 6843; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6844; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] 6845; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 6846; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] 6847; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 6848; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 6849; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 6850; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 6851; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 6852; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 6853; AVX-NEXT: vandnps %ymm4, %ymm2, %ymm1 6854; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 6855; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 6856; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6857; AVX-NEXT: vmovdqa 48(%rax), %xmm12 6858; AVX-NEXT: vmovdqa 48(%r8), %xmm2 6859; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6860; AVX-NEXT: vmovdqa 48(%r9), %xmm1 6861; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6862; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128] 6863; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm0 6864; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 6865; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6866; AVX-NEXT: vmovdqa 
{{.*#+}} xmm14 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13] 6867; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm1 6868; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 6869; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] 6870; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] 6871; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 6872; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 6873; AVX-NEXT: vmovdqa 48(%rsi), %xmm1 6874; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6875; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 6876; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 6877; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6878; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm1 6879; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] 6880; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 6881; AVX-NEXT: vmovdqa 48(%rcx), %xmm2 6882; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 6883; AVX-NEXT: vmovdqa 48(%rdx), %xmm13 6884; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] 6885; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6886; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] 6887; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm3 6888; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] 6889; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 6890; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1 6891; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3 6892; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 6893; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 6894; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 6895; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 6896; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 6897; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6898; AVX-NEXT: vmovdqa (%r9), %xmm6 6899; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u] 6900; AVX-NEXT: vmovdqa (%r8), %xmm8 6901; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u] 6902; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 6903; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] 6904; AVX-NEXT: vmovdqa (%rax), %xmm7 6905; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u] 6906; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 6907; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm1 6908; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 6909; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6910; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6911; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6912; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm15 6913; AVX-NEXT: vpor %xmm1, %xmm15, %xmm1 6914; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 6915; AVX-NEXT: vmovdqa (%rcx), %xmm5 6916; AVX-NEXT: vmovdqa (%rdx), %xmm3 6917; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] 6918; AVX-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] 6919; AVX-NEXT: vpor %xmm0, %xmm14, %xmm0 6920; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 6921; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6922; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm14 6923; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 6924; AVX-NEXT: vmovdqa (%rsi), %xmm14 6925; AVX-NEXT: vmovdqa (%rdi), %xmm2 6926; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero 6927; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] 6928; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 6929; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 6930; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6931; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] 6932; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 6933; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] 6934; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4 6935; AVX-NEXT: vandps %ymm1, %ymm9, %ymm1 6936; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1 6937; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 6938; AVX-NEXT: vandnps %ymm15, %ymm0, %ymm4 6939; AVX-NEXT: vandps %ymm0, %ymm1, %ymm1 6940; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1 6941; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6942; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6943; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero 6944; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] 6945; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] 6946; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 6947; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] 6948; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] 6949; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 6950; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 6951; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] 6952; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6953; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] 6954; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] 6955; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 6956; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] 6957; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm4 6958; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 6959; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 6960; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 6961; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6962; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] 6963; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm2 6964; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] 6965; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3 6966; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 6967; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] 6968; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0 6969; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2 6970; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 6971; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 6972; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 6973; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 6974; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 6975; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6976; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6977; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 6978; AVX-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 6979; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6980; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6981; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero 6982; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] 6983; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 6984; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] 6985; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] 6986; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 6987; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 6988; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6989; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6990; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 6991; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6992; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] 6993; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] 6994; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 6995; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6996; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6997; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 6998; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6999; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] 7000; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] 7001; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm11 7002; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 7003; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3 7004; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4 7005; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 7006; AVX-NEXT: vmovaps {{.*#+}} ymm1 = 
[255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 7007; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0 7008; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3 7009; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 7010; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7011; AVX-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload 7012; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u] 7013; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] 7014; AVX-NEXT: vpor %xmm0, %xmm3, %xmm3 7015; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 7016; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm4 7017; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 7018; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7019; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero 7020; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] 7021; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 7022; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] 7023; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm11 7024; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 7025; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] 7026; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 7027; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 7028; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 7029; AVX-NEXT: vextractf128 $1, %ymm3, %xmm4 7030; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15] 7031; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7032; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero 7033; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 7034; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] 7035; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7036; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero 7037; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 7038; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15] 7039; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero 7040; AVX-NEXT: vpor %xmm4, %xmm11, %xmm1 7041; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7042; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15] 7043; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero 7044; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 7045; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15] 7046; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero 7047; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 7048; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15] 7049; AVX-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero 7050; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1 7051; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7052; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] 7053; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm4 7054; AVX-NEXT: vpmovsxdq {{.*#+}} xmm10 = [218890240,986624] 7055; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 7056; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 7057; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] 7058; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 7059; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] 7060; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0 7061; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 7062; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 7063; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3 7064; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0 7065; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 7066; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero 7067; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] 7068; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 7069; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15] 7070; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero 7071; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 7072; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15] 7073; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero 7074; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1 7075; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7076; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 7077; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 7078; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] 7079; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u] 7080; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 7081; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero 7082; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] 7083; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 7084; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7085; AVX-NEXT: vpmovsxdq {{.*#+}} xmm15 = [16777216,197120] 7086; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7087; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 7088; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7089; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm3 7090; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 7091; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5] 7092; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7093; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm4 7094; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7095; AVX-NEXT: 
vpshufb %xmm10, %xmm1, %xmm7 7096; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 7097; AVX-NEXT: vandnps %ymm0, %ymm8, %ymm0 7098; AVX-NEXT: vandps %ymm4, %ymm8, %ymm4 7099; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 7100; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7101; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload 7102; AVX-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 7103; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero 7104; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7105; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15] 7106; AVX-NEXT: vpor %xmm7, %xmm4, %xmm7 7107; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] 7108; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7109; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm10 7110; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] 7111; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] 7112; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm5 7113; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 7114; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 7115; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0 7116; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 7117; AVX-NEXT: vorps %ymm5, %ymm0, %ymm0 7118; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7119; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7120; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5 7121; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm10 7122; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 7123; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7124; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] 7125; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm10 7126; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm12 7127; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 7128; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] 7129; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5 7130; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10 7131; AVX-NEXT: vorps %ymm5, %ymm10, %ymm5 7132; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u] 7133; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm10 7134; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7135; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u] 7136; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm13 7137; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 7138; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm13 7139; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] 7140; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm13 7141; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 7142; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 7143; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 7144; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10 7145; AVX-NEXT: vorps %ymm5, %ymm10, %ymm10 7146; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7147; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5 7148; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm14 7149; AVX-NEXT: vinsertf128 $1, %xmm5, 
%ymm14, %ymm5 7150; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7151; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm14 7152; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm8 7153; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 7154; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5 7155; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8 7156; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5 7157; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7158; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm8 7159; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7160; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm12 7161; AVX-NEXT: vpor %xmm8, %xmm12, %xmm8 7162; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm12 7163; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] 7164; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12 7165; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 7166; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 7167; AVX-NEXT: vandnps %ymm8, %ymm13, %ymm8 7168; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5 7169; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7170; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm8 7171; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7172; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 7173; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 7174; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7175; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm2 7176; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7177; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] 7178; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 7179; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] 7180; AVX-NEXT: vandnps %ymm8, %ymm0, %ymm3 7181; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 7182; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 7183; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] 7184; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7185; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 7186; AVX-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 7187; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero 7188; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 7189; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7190; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm1 7191; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7192; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 7193; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 7194; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 7195; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2 7196; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1 7197; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 7198; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 7199; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7200; AVX-NEXT: vmovaps %ymm0, 128(%rax) 7201; AVX-NEXT: vmovaps %ymm1, 96(%rax) 7202; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7203; AVX-NEXT: vmovaps %ymm0, 64(%rax) 7204; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7205; AVX-NEXT: vmovaps %ymm0, 32(%rax) 7206; AVX-NEXT: vmovaps %ymm5, (%rax) 7207; AVX-NEXT: vmovaps %ymm10, 224(%rax) 7208; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload 7209; AVX-NEXT: vmovaps %ymm1, 352(%rax) 7210; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7211; AVX-NEXT: vmovaps %ymm0, 320(%rax) 7212; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7213; AVX-NEXT: vmovaps %ymm0, 288(%rax) 7214; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7215; AVX-NEXT: vmovaps %ymm0, 256(%rax) 7216; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7217; AVX-NEXT: vmovaps %ymm0, 160(%rax) 7218; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7219; AVX-NEXT: vmovaps %ymm0, 192(%rax) 7220; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7221; AVX-NEXT: vmovaps %xmm0, 432(%rax) 7222; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7223; AVX-NEXT: vmovaps %xmm0, 416(%rax) 7224; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7225; AVX-NEXT: vmovaps %xmm0, 384(%rax) 7226; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7227; AVX-NEXT: vmovaps %xmm0, 400(%rax) 7228; AVX-NEXT: addq $616, %rsp # imm = 0x268 7229; AVX-NEXT: vzeroupper 7230; AVX-NEXT: retq 7231; 7232; AVX2-LABEL: store_i8_stride7_vf64: 7233; AVX2: # %bb.0: 7234; AVX2-NEXT: subq $824, %rsp # imm = 0x338 7235; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 7236; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 7237; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7238; AVX2-NEXT: vmovdqa 32(%rsi), %ymm2 7239; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7240; AVX2-NEXT: vmovdqa 32(%rdx), %ymm6 7241; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7242; AVX2-NEXT: vmovdqa 32(%rcx), %ymm7 7243; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7244; AVX2-NEXT: vmovdqa 32(%r8), %ymm5 7245; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7246; AVX2-NEXT: vmovdqa 32(%r9), %ymm4 7247; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7248; AVX2-NEXT: vmovdqa 32(%rax), %ymm3 7249; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7250; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] 7251; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero 7252; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 7253; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 7254; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero 7255; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] 7256; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 7257; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7258; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 7259; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7260; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] 7261; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7262; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] 7263; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7264; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] 7265; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7266; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] 7267; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7268; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 7269; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7270; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 7271; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7272; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7273; AVX2-NEXT: vmovdqa (%r8), %ymm0 7274; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7275; AVX2-NEXT: vmovdqa (%r9), %ymm1 7276; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7277; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] 7278; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero 7279; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 7280; AVX2-NEXT: vmovdqa (%rax), %ymm1 7281; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7282; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 7283; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7284; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 7285; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] 7286; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7287; AVX2-NEXT: vmovdqa (%rdx), %ymm1 7288; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7289; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 7290; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] 7291; AVX2-NEXT: vmovdqa (%rcx), %ymm2 7292; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7293; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] 7294; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] 7295; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 7296; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7297; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 7298; AVX2-NEXT: vmovdqa (%rdi), %ymm2 7299; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7300; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 7301; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] 7302; AVX2-NEXT: vmovdqa (%rsi), %ymm3 7303; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7304; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] 7305; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] 
7306; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 7307; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7308; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 7309; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7310; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 7311; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 7312; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 7313; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 7314; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 7315; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7316; AVX2-NEXT: vmovdqa 32(%rsi), %xmm13 7317; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 7318; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 7319; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7320; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 7321; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 7322; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7323; AVX2-NEXT: vmovdqa 32(%rcx), %xmm11 7324; AVX2-NEXT: vmovdqa 32(%rdx), %xmm9 7325; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] 7326; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 7327; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 7328; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7329; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] 7330; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm10 7331; AVX2-NEXT: vmovdqa (%rsi), %xmm0 7332; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7333; AVX2-NEXT: vmovdqa (%rdi), %xmm6 7334; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 7335; AVX2-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill 7336; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 7337; AVX2-NEXT: vmovdqa (%rcx), %xmm1 7338; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7339; AVX2-NEXT: vmovdqa (%rdx), %xmm7 7340; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] 7341; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7342; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 7343; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7344; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7345; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 7346; AVX2-NEXT: vmovdqa 32(%r9), %xmm0 7347; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7348; AVX2-NEXT: vmovdqa 32(%r8), %xmm8 7349; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] 7350; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 7351; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 7352; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7353; AVX2-NEXT: vmovdqa 32(%rax), %xmm2 7354; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7355; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm2[1,1,0,0,4,5,6,7] 7356; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] 7357; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] 7358; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 7359; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 7360; AVX2-NEXT: vmovdqa (%r9), %xmm3 7361; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7362; AVX2-NEXT: vmovdqa (%r8), %xmm5 7363; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7364; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 7365; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm0 7366; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] 7367; AVX2-NEXT: vmovdqa (%rax), %xmm0 7368; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7369; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7] 7370; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] 7371; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] 7372; AVX2-NEXT: vpblendvb %ymm2, %ymm12, %ymm15, %ymm2 7373; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 7374; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm1, %ymm0 7375; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7376; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm0 7377; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7378; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 7379; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm2 7380; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 7381; AVX2-NEXT: vpshufb %xmm4, %xmm14, %xmm12 7382; AVX2-NEXT: vpor %xmm2, %xmm12, %xmm2 7383; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 7384; AVX2-NEXT: vpshufb %xmm15, %xmm11, %xmm12 7385; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 7386; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm14 7387; AVX2-NEXT: vpor %xmm12, %xmm14, %xmm12 7388; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7389; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] 7390; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 7391; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 7392; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7393; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7394; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 7395; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm2 7396; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 7397; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7398; AVX2-NEXT: vpshufb %xmm15, %xmm6, %xmm2 7399; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 7400; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 7401; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7402; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7403; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm12 7404; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 7405; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 7406; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm2 7407; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 7408; AVX2-NEXT: vmovdqa %xmm8, %xmm3 7409; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm14 7410; AVX2-NEXT: vpor %xmm2, %xmm14, %xmm2 7411; AVX2-NEXT: vpermq 
{{.*#+}} ymm2 = ymm2[0,1,0,1] 7412; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 7413; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7414; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm15 7415; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] 7416; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 7417; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 7418; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 7419; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm1 7420; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7421; AVX2-NEXT: vpshufb %xmm4, %xmm15, %xmm4 7422; AVX2-NEXT: vpor %xmm1, %xmm4, %xmm1 7423; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7424; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7425; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm4 7426; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,0] 7427; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0 7428; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 7429; AVX2-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7430; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7431; AVX2-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 7432; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7433; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] 7434; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload 7435; AVX2-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] 7436; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 7437; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 7438; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7439; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 7440; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 7441; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7442; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] 7443; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 7444; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload 7445; AVX2-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] 7446; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 7447; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7448; AVX2-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload 7449; AVX2-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 7450; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2 7451; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7452; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7453; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 7454; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 7455; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 7456; AVX2-NEXT: vpshufb 
%xmm3, %xmm2, %xmm2 7457; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7458; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6] 7459; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] 7460; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7461; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 7462; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 7463; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] 7464; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 7465; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 7466; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6] 7467; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] 7468; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7469; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 7470; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 7471; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 7472; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7473; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 7474; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7475; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 7476; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 7477; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7478; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm2 7479; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] 7480; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7481; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm5 7482; AVX2-NEXT: vpor %ymm2, %ymm5, %ymm2 7483; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 7484; AVX2-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 7485; AVX2-NEXT: # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 7486; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] 7487; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] 7488; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] 7489; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 7490; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7491; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7492; AVX2-NEXT: vpshufb %ymm3, %ymm8, %ymm3 7493; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 7494; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm4 7495; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 7496; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] 7497; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 7498; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 7499; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] 7500; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] 7501; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 7502; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7503; AVX2-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 7504; AVX2-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 7505; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] 7506; 
AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] 7507; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7508; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm6 7509; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] 7510; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 7511; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 7512; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7513; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm5 7514; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7515; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 7516; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5] 7517; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 7518; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 7519; AVX2-NEXT: # ymm6 = mem[0,1,0,1] 7520; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7521; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm7 7522; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] 7523; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7524; AVX2-NEXT: vpshufb %ymm3, %ymm10, %ymm9 7525; AVX2-NEXT: vpor %ymm7, %ymm9, %ymm7 7526; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 7527; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 7528; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] 7529; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 7530; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7531; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm6 7532; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 7533; AVX2-NEXT: vpshufb %ymm3, %ymm12, %ymm7 7534; AVX2-NEXT: vpor %ymm6, %ymm7, %ymm6 7535; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 7536; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 7537; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 7538; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 7539; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7540; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 7541; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7542; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7543; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 7544; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7545; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] 7546; AVX2-NEXT: vmovdqa %ymm1, %ymm0 7547; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero 7548; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 7549; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero 7550; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] 7551; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 7552; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 7553; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 7554; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 7555; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 7556; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero 7557; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero 7558; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 7559; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 7560; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 7561; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 7562; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] 7563; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 7564; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 7565; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm1 7566; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7567; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 7568; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm5 7569; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] 7570; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7571; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm8 7572; AVX2-NEXT: vpor %ymm5, %ymm8, %ymm5 7573; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 7574; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm9 7575; AVX2-NEXT: vmovdqa %ymm2, %ymm3 7576; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] 7577; AVX2-NEXT: vpshufb %ymm1, %ymm10, %ymm11 7578; AVX2-NEXT: vmovdqa %ymm10, %ymm2 7579; AVX2-NEXT: vpor %ymm9, %ymm11, %ymm9 7580; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] 7581; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm9, %ymm5 7582; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6 7583; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7584; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 7585; AVX2-NEXT: vpor %ymm6, %ymm7, %ymm6 7586; AVX2-NEXT: vpshufb %ymm8, %ymm15, %ymm7 7587; AVX2-NEXT: vpshufb %ymm1, %ymm12, %ymm8 7588; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 7589; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm6 7590; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] 7591; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7592; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm8 7593; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
[13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 7594; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7595; AVX2-NEXT: vpshufb %ymm9, %ymm15, %ymm10 7596; AVX2-NEXT: vpor %ymm8, %ymm10, %ymm8 7597; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 7598; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7599; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11 7600; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] 7601; AVX2-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 7602; AVX2-NEXT: vpshufb %ymm7, %ymm13, %ymm7 7603; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7604; AVX2-NEXT: vpshufb %ymm9, %ymm11, %ymm9 7605; AVX2-NEXT: vpor %ymm7, %ymm9, %ymm7 7606; AVX2-NEXT: vpshufb %ymm10, %ymm14, %ymm9 7607; AVX2-NEXT: vpblendvb %ymm12, %ymm7, %ymm9, %ymm7 7608; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 7609; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 7610; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 7611; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] 7612; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 7613; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] 7614; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7615; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 7616; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7617; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] 7618; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 7619; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] 7620; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7621; AVX2-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 7622; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 7623; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 7624; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 7625; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 7626; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] 7627; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] 7628; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] 7629; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 7630; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] 7631; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] 7632; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] 7633; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 7634; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 7635; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 7636; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 7637; AVX2-NEXT: 
vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 7638; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 7639; AVX2-NEXT: vmovdqa %ymm6, 96(%rax) 7640; AVX2-NEXT: vmovdqa %ymm5, 320(%rax) 7641; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7642; AVX2-NEXT: vmovaps %ymm0, 160(%rax) 7643; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7644; AVX2-NEXT: vmovaps %ymm0, 128(%rax) 7645; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7646; AVX2-NEXT: vmovaps %ymm0, 64(%rax) 7647; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7648; AVX2-NEXT: vmovaps %ymm0, 32(%rax) 7649; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7650; AVX2-NEXT: vmovaps %ymm0, (%rax) 7651; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7652; AVX2-NEXT: vmovaps %ymm0, 224(%rax) 7653; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7654; AVX2-NEXT: vmovaps %ymm0, 352(%rax) 7655; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7656; AVX2-NEXT: vmovaps %ymm0, 288(%rax) 7657; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7658; AVX2-NEXT: vmovaps %ymm0, 256(%rax) 7659; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7660; AVX2-NEXT: vmovaps %ymm0, 192(%rax) 7661; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7662; AVX2-NEXT: vmovaps %ymm0, 384(%rax) 7663; AVX2-NEXT: vmovdqa %ymm7, 416(%rax) 7664; AVX2-NEXT: addq $824, %rsp # imm = 0x338 7665; AVX2-NEXT: vzeroupper 7666; AVX2-NEXT: retq 7667; 7668; AVX2-FP-LABEL: store_i8_stride7_vf64: 7669; AVX2-FP: # %bb.0: 7670; AVX2-FP-NEXT: subq $616, %rsp # imm = 0x268 7671; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7672; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 7673; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 7674; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm6 7675; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 7676; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm4 7677; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm5 7678; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm3 7679; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25] 7680; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7681; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero 7682; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7683; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 7684; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 7685; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero 7686; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm7 7687; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7688; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] 7689; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7690; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 7691; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7692; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 7693; AVX2-FP-NEXT: vpblendvb 
%ymm2, %ymm0, %ymm1, %ymm0 7694; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] 7695; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7696; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7697; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] 7698; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7699; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7700; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] 7701; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7702; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] 7703; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7704; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 7705; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7706; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7707; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 7708; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7709; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7710; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] 7711; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero 7712; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 7713; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero 7714; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero 7715; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 7716; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 7717; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7718; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 7719; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7720; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] 7721; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 7722; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] 7723; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7724; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] 7725; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 7726; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] 7727; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7728; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 7729; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 7730; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 7731; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 7732; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7733; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm15 7734; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm7 7735; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] 7736; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7737; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7738; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 7739; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 7740; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7741; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm9 7742; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm11 7743; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] 7744; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 7745; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 7746; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7747; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] 7748; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 7749; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7750; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0 7751; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7752; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 7753; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 7754; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7755; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 7756; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 7757; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7758; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm10 7759; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 7760; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7761; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 7762; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7763; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7764; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 7765; AVX2-FP-NEXT: vmovdqa 32(%rax), %xmm0 7766; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7767; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] 7768; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm1 7769; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] 7770; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0 7771; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 7772; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm14 7773; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] 7774; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = 
[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 7775; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 7776; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 7777; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 7778; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 7779; AVX2-FP-NEXT: vmovdqa (%r9), %xmm5 7780; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7781; AVX2-FP-NEXT: vmovdqa (%r8), %xmm8 7782; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7783; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] 7784; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm12 7785; AVX2-FP-NEXT: vmovdqa (%rax), %xmm13 7786; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 7787; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] 7788; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] 7789; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm12, %ymm6, %ymm2 7790; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 7791; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 7792; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7793; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 7794; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7795; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 7796; AVX2-FP-NEXT: vpshufb %xmm1, %xmm15, %xmm2 7797; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 7798; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm6 7799; AVX2-FP-NEXT: vpor %xmm2, %xmm6, %xmm2 7800; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 7801; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm12 7802; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 7803; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm7 7804; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm15 7805; AVX2-FP-NEXT: vpor %xmm12, %xmm15, %xmm12 7806; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7807; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] 7808; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 7809; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm2 7810; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7811; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7812; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 7813; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 7814; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 7815; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7816; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm2 7817; AVX2-FP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 7818; AVX2-FP-NEXT: vpor %xmm2, %xmm0, %xmm0 7819; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7820; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7821; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 7822; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 7823; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload 7824; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm2 7825; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 7826; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm6 7827; AVX2-FP-NEXT: vpor %xmm2, %xmm6, %xmm2 
7828; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7829; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 7830; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7831; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm15 7832; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] 7833; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 7834; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 7835; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 7836; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 7837; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7838; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 7839; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 7840; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7841; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm3 7842; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] 7843; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 7844; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 7845; AVX2-FP-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7846; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7847; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm11, %ymm0, %ymm0 7848; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7849; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] 7850; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7851; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 7852; AVX2-FP-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 7853; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 7854; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 7855; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 7856; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 7857; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 7858; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7859; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] 7860; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 7861; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload 7862; AVX2-FP-NEXT: # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] 7863; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 7864; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload 7865; AVX2-FP-NEXT: # xmm2 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] 7866; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 7867; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 7868; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7869; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 7870; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] 7871; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] 7872; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 7873; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7874; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 7875; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 7876; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 7877; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 7878; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 7879; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm3 7880; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] 7881; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 7882; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 7883; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 7884; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 7885; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 7886; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 7887; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7888; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 7889; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7890; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 7891; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] 7892; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm5 7893; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero 7894; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm2 7895; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 7896; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero 7897; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm13 7898; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero 7899; AVX2-FP-NEXT: vpor %ymm4, %ymm6, %ymm4 7900; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 7901; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 7902; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 7903; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm6 7904; AVX2-FP-NEXT: vmovdqa (%r8), %ymm14 7905; AVX2-FP-NEXT: vmovdqa (%r9), %ymm1 7906; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,ymm14[27,28,29,30],zero,ymm14[28],zero,ymm14[26,27,30,31],zero,ymm14[29] 7907; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero 7908; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 7909; AVX2-FP-NEXT: vpor %ymm2, %ymm7, %ymm7 7910; AVX2-FP-NEXT: vmovdqa (%rax), %ymm10 7911; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 7912; AVX2-FP-NEXT: vmovdqu %ymm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7913; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 7914; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 7915; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] 7916; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 7917; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 7918; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 7919; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7920; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[23],zero,ymm13[27,20,21,26],zero,ymm13[24],zero,ymm13[26,27,26,27],zero,ymm13[25] 7921; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7922; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero 7923; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm2 7924; AVX2-FP-NEXT: vpor %ymm6, %ymm8, %ymm6 7925; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero 7926; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7927; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] 7928; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 7929; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 7930; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 7931; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 7932; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 7933; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero 7934; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero 7935; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7936; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 7937; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 7938; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 7939; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 7940; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] 7941; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 7942; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 7943; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 7944; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7945; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 7946; AVX2-FP-NEXT: # ymm9 = mem[0,1,0,1] 7947; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7948; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm8 7949; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] 7950; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7951; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm11 7952; AVX2-FP-NEXT: vpor %ymm8, %ymm11, %ymm8 7953; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] 7954; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] 7955; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7956; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm12 7957; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] 7958; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7959; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm15 7960; AVX2-FP-NEXT: vpor %ymm12, %ymm15, %ymm12 7961; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 7962; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 7963; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] 7964; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm4 7965; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7966; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 7967; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm10 7968; AVX2-FP-NEXT: vpor %ymm9, %ymm10, %ymm9 7969; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 7970; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm10 7971; AVX2-FP-NEXT: vpshufb %ymm0, %ymm13, %ymm11 7972; AVX2-FP-NEXT: vpor %ymm10, %ymm11, %ymm10 7973; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 7974; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 7975; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 7976; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 7977; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] 7978; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7979; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 7980; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] 7981; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7982; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm13 7983; AVX2-FP-NEXT: vpor %ymm11, %ymm13, %ymm11 7984; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 7985; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] 7986; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] 7987; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 7988; AVX2-FP-NEXT: vpshufb %ymm13, %ymm6, %ymm15 7989; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] 7990; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] 7991; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm15, %ymm11 7992; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload 7993; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm10 7994; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 7995; AVX2-FP-NEXT: vpor %ymm10, %ymm12, %ymm10 7996; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 7997; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2 # 32-byte Reload 7998; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 7999; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] 8000; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 8001; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 8002; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 8003; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8004; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 8005; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 8006; AVX2-FP-NEXT: vpshufb %ymm0, %ymm7, %ymm10 8007; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] 8008; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm12 8009; AVX2-FP-NEXT: vpor %ymm10, %ymm12, %ymm10 8010; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 8011; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8012; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 8013; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] 8014; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8015; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm7 8016; AVX2-FP-NEXT: vpor %ymm7, %ymm13, %ymm7 8017; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] 8018; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm10 8019; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 8020; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8021; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 8022; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 8023; AVX2-FP-NEXT: vpshufb %ymm12, %ymm5, %ymm1 8024; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8025; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 8026; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 8027; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 8028; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] 8029; AVX2-FP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 8030; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 8031; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm7 8032; AVX2-FP-NEXT: vpor %ymm3, %ymm7, %ymm3 8033; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 8034; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm11 8035; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] 8036; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 8037; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8038; AVX2-FP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 8039; AVX2-FP-NEXT: vpshufb %ymm5, %ymm15, %ymm4 8040; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1 8041; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 8042; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 8043; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 8044; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm3, %ymm3 8045; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8046; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8047; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) 8048; AVX2-FP-NEXT: vmovdqa %ymm3, 320(%rax) 8049; AVX2-FP-NEXT: vmovdqa %ymm9, 128(%rax) 8050; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8051; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax) 8052; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8053; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) 8054; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8055; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) 8056; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8057; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) 8058; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8059; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) 8060; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8061; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) 8062; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8063; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) 8064; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8065; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) 8066; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8067; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) 8068; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8069; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) 8070; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8071; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax) 8072; AVX2-FP-NEXT: addq $616, %rsp # imm = 0x268 8073; AVX2-FP-NEXT: vzeroupper 8074; AVX2-FP-NEXT: retq 8075; 8076; AVX2-FCP-LABEL: store_i8_stride7_vf64: 8077; AVX2-FCP: # %bb.0: 8078; AVX2-FCP-NEXT: subq $616, %rsp # imm = 0x268 8079; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8080; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 8081; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 8082; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 8083; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 8084; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm4 8085; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm5 8086; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm3 8087; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25] 8088; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8089; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero 8090; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8091; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 8092; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 8093; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero 8094; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm7 8095; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8096; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] 8097; AVX2-FCP-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8098; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 8099; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 8100; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 8101; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8102; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] 8103; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8104; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 8105; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] 8106; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8107; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8108; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] 8109; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8110; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] 8111; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8112; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 8113; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8114; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 8115; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] 8116; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8117; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8118; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] 8119; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero 8120; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 8121; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero 8122; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero 8123; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 8124; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 8125; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 8126; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 8127; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8128; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] 8129; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] 8130; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] 8131; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8132; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] 8133; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] 8134; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] 8135; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8136; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 8137; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 8138; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] 8139; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8140; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8141; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm14 8142; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 8143; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8144; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 8145; AVX2-FCP-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill 8146; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 8147; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8148; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 8149; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm10 8150; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm7 8151; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] 8152; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 8153; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 8154; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8155; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] 8156; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 8157; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8158; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm11 8159; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm12 8160; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 8161; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8162; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8163; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8164; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2 8165; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8166; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 8167; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8168; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 8169; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 8170; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 8171; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8172; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm6 8173; AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm0 8174; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8175; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] 8176; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] 8177; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm4 8178; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm8 
8179; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13 8180; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] 8181; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 8182; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 8183; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] 8184; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] 8185; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm0 8186; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm4 8187; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8188; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,0,0,4,5,6,7] 8189; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3 8190; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm9 8191; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm5 8192; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8193; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 8194; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm2 8195; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8196; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 8197; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] 8198; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 8199; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8200; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 8201; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8202; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 8203; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm0 8204; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 8205; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8206; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 8207; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 8208; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 8209; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm6 8210; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 8211; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm14 8212; AVX2-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6 8213; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 8214; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 8215; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] 8216; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 8217; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8218; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 8219; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 8220; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 8221; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 8222; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm2 8223; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8224; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm3 8225; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 8226; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8227; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8228; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 8229; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} xmm2 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 8230; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm3 8231; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm11 8232; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 8233; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm14 8234; AVX2-FCP-NEXT: vpor %xmm3, %xmm14, %xmm3 8235; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] 8236; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 8237; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8238; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm15 8239; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] 8240; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] 8241; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 8242; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm2 8243; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm12 8244; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 8245; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 8246; AVX2-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 8247; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8248; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8249; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm6 8250; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] 8251; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm6, %ymm0 8252; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] 8253; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 8254; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8255; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 8256; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8257; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] 8258; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 8259; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 8260; AVX2-FCP-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 8261; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 8262; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 8263; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] 8264; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 8265; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 8266; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8267; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] 8268; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 8269; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 8270; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 8271; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8272; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 8273; AVX2-FCP-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] 8274; 
AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 8275; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 8276; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8277; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 8278; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] 8279; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6] 8280; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] 8281; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] 8282; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 8283; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 8284; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 8285; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8286; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] 8287; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 8288; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] 8289; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 8290; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] 8291; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 8292; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 8293; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 8294; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] 8295; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 8296; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8297; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 8298; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8299; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 8300; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] 8301; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 8302; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero 8303; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm2 8304; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 8305; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero 8306; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 8307; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero 8308; AVX2-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 8309; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] 8310; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] 8311; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] 8312; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm6 8313; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm14 8314; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm1 8315; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,ymm14[27,28,29,30],zero,ymm14[28],zero,ymm14[26,27,30,31],zero,ymm14[29] 
8316; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero 8317; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 8318; AVX2-FCP-NEXT: vpor %ymm2, %ymm7, %ymm7 8319; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm10 8320; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 8321; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8322; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 8323; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] 8324; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] 8325; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 8326; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] 8327; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 8328; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8329; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[23],zero,ymm13[27,20,21,26],zero,ymm13[24],zero,ymm13[26,27,26,27],zero,ymm13[25] 8330; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8331; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero 8332; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm2 8333; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 8334; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero 8335; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8336; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] 8337; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 8338; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] 8339; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 8340; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] 8341; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 8342; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero 8343; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero 8344; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8345; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 8346; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 8347; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 8348; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 8349; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] 8350; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 8351; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] 8352; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 8353; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8354; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 8355; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] 8356; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8357; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm8 8358; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] 8359; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8360; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm11 8361; AVX2-FCP-NEXT: vpor %ymm8, %ymm11, %ymm8 8362; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] 8363; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] 8364; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 8365; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm12 8366; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] 8367; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8368; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm15 8369; AVX2-FCP-NEXT: vpor %ymm12, %ymm15, %ymm12 8370; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 8371; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] 8372; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] 8373; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm4 8374; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8375; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 8376; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm10 8377; AVX2-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 8378; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 8379; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm10 8380; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm11 8381; AVX2-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 8382; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] 8383; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 8384; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 8385; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 8386; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] 8387; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 8388; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 8389; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] 8390; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8391; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm13 8392; AVX2-FCP-NEXT: vpor %ymm11, %ymm13, %ymm11 8393; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 8394; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 8395; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 8396; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] 8397; AVX2-FCP-NEXT: vpermd %ymm13, 
%ymm15, %ymm13 8398; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] 8399; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 8400; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload 8401; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm10 8402; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 8403; AVX2-FCP-NEXT: vpor %ymm10, %ymm12, %ymm10 8404; AVX2-FCP-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload 8405; AVX2-FCP-NEXT: # ymm12 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 8406; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 8407; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] 8408; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 8409; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] 8410; AVX2-FCP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload 8411; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 8412; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 8413; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm10 8414; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] 8415; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm12 8416; AVX2-FCP-NEXT: vpor %ymm10, %ymm12, %ymm10 8417; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 8418; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8419; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 8420; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] 8421; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8422; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm7 8423; AVX2-FCP-NEXT: vpor %ymm7, %ymm13, %ymm7 8424; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] 8425; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm10 8426; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 8427; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8428; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 8429; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 8430; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm1 8431; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8432; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 8433; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 8434; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 8435; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] 8436; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 8437; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 8438; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm7 8439; AVX2-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 8440; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 8441; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm11 8442; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] 8443; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 8444; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8445; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 8446; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 8447; AVX2-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 8448; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8449; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 8450; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 8451; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] 8452; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm3, %ymm3 8453; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 8454; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8455; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) 8456; AVX2-FCP-NEXT: vmovdqa %ymm3, 320(%rax) 8457; AVX2-FCP-NEXT: vmovdqa %ymm9, 128(%rax) 8458; AVX2-FCP-NEXT: vmovdqa %ymm14, 352(%rax) 8459; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8460; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax) 8461; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8462; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) 8463; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8464; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) 8465; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8466; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) 8467; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8468; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) 8469; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8470; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) 8471; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8472; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) 8473; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8474; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) 8475; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8476; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) 8477; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8478; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) 8479; AVX2-FCP-NEXT: addq $616, %rsp # imm = 0x268 8480; AVX2-FCP-NEXT: vzeroupper 8481; AVX2-FCP-NEXT: retq 8482; 8483; AVX512-LABEL: store_i8_stride7_vf64: 8484; AVX512: # %bb.0: 8485; AVX512-NEXT: subq $1384, %rsp # imm = 0x568 8486; AVX512-NEXT: vmovdqa (%rsi), %ymm7 8487; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] 8488; AVX512-NEXT: vmovdqa (%rdi), %ymm2 8489; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero 8490; AVX512-NEXT: vmovdqa64 %ymm2, %ymm20 8491; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 8492; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8493; AVX512-NEXT: vmovdqa (%rcx), %ymm15 8494; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 8495; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm0 8496; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23 8497; AVX512-NEXT: vmovdqa (%rdx), %ymm2 8498; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 8499; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm1 8500; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26 8501; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 8502; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 8503; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8504; AVX512-NEXT: vmovdqa (%r8), %ymm14 8505; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] 8506; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm0 8507; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27 8508; AVX512-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8509; AVX512-NEXT: vmovdqa (%r9), %ymm8 8510; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] 8511; AVX512-NEXT: # ymm3 = mem[0,1,0,1] 8512; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm1 8513; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17 8514; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 8515; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8516; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10 8517; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] 8518; AVX512-NEXT: # ymm6 = mem[0,1,0,1] 8519; AVX512-NEXT: vpshufb %ymm6, %ymm10, %ymm0 8520; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9 8521; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero,ymm9[25] 8522; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8523; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8524; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] 8525; AVX512-NEXT: vpshufb %ymm1, %ymm9, %ymm0 8526; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 8527; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero 8528; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8529; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8530; AVX512-NEXT: vmovdqa 32(%rdx), %ymm5 8531; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] 8532; AVX512-NEXT: # ymm1 = mem[0,1,0,1] 8533; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm2 8534; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4 8535; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero 8536; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 8537; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8538; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] 8539; AVX512-NEXT: # ymm0 = mem[0,1,0,1] 8540; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm2 8541; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21 8542; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] 8543; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 8544; AVX512-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8545; AVX512-NEXT: vmovdqa 32(%r8), %ymm3 8546; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] 8547; AVX512-NEXT: # ymm0 = mem[0,1,0,1] 8548; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm11 8549; AVX512-NEXT: vmovdqa64 %ymm0, %ymm25 8550; AVX512-NEXT: vmovdqa 32(%r9), %ymm2 8551; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 8552; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 8553; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8554; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] 8555; AVX512-NEXT: # ymm11 = mem[0,1,0,1] 8556; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm12 8557; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31] 8558; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 8559; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8560; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 8561; AVX512-NEXT: vmovdqa 32(%rax), %ymm0 8562; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8563; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 8564; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 8565; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 8566; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8567; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm6 8568; AVX512-NEXT: vmovdqa64 %ymm7, %ymm22 8569; AVX512-NEXT: vmovdqa64 %ymm20, %ymm7 8570; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0 8571; AVX512-NEXT: vpshufb %ymm0, %ymm7, %ymm7 8572; AVX512-NEXT: vpor %ymm6, %ymm7, %ymm0 8573; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8574; AVX512-NEXT: vmovdqa64 %ymm18, %ymm12 8575; AVX512-NEXT: vpshufb %ymm1, %ymm12, %ymm6 8576; AVX512-NEXT: vmovdqa64 %ymm21, %ymm0 8577; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm7 8578; AVX512-NEXT: vmovdqa64 %ymm15, %ymm19 8579; AVX512-NEXT: vpor %ymm6, %ymm7, %ymm0 8580; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8581; AVX512-NEXT: vmovdqa64 %ymm25, %ymm0 8582; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm6 8583; AVX512-NEXT: vpshufb %ymm11, %ymm8, %ymm7 8584; AVX512-NEXT: vmovdqa64 %ymm8, %ymm16 8585; AVX512-NEXT: vpor %ymm6, %ymm7, %ymm0 8586; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8587; AVX512-NEXT: vmovdqa 32(%rdx), %xmm0 8588; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8589; AVX512-NEXT: vmovdqa 32(%rcx), %xmm1 8590; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8591; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 8592; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm6 8593; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 8594; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm7 8595; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21 8596; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm0 8597; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8598; AVX512-NEXT: vmovdqa 
32(%rdi), %xmm0 8599; AVX512-NEXT: vmovdqa 32(%rsi), %xmm15 8600; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 8601; AVX512-NEXT: vpshufb %xmm6, %xmm15, %xmm7 8602; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 8603; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm8 8604; AVX512-NEXT: vmovdqa64 %xmm0, %xmm30 8605; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm0 8606; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8607; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] 8608; AVX512-NEXT: vmovdqa 32(%rax), %xmm0 8609; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8610; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] 8611; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 8612; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm28 8613; AVX512-NEXT: vmovdqa 32(%r9), %xmm0 8614; AVX512-NEXT: vmovdqa 32(%r8), %xmm14 8615; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 8616; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm7 8617; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24 8618; AVX512-NEXT: vmovdqa64 %xmm0, %xmm18 8619; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 8620; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm8 8621; AVX512-NEXT: vmovdqa64 %xmm0, %xmm29 8622; AVX512-NEXT: vporq %xmm7, %xmm8, %xmm31 8623; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 8624; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm7 8625; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 8626; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm8 8627; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm0 8628; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8629; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 8630; AVX512-NEXT: # ymm1 = mem[0,1,0,1] 8631; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm0 8632; AVX512-NEXT: vmovdqa64 %ymm1, %ymm23 8633; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] 8634; AVX512-NEXT: # ymm7 = mem[0,1,0,1] 8635; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm1 8636; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 8637; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8638; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] 8639; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero 8640; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 8641; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8642; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] 8643; AVX512-NEXT: # ymm0 = mem[0,1,0,1] 8644; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm1 8645; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] 8646; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm4 8647; AVX512-NEXT: vpor %ymm1, %ymm4, %ymm1 8648; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8649; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 8650; AVX512-NEXT: # ymm1 = mem[0,1,0,1] 8651; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm4 8652; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] 8653; AVX512-NEXT: # ymm9 = mem[0,1,0,1] 8654; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm5 8655; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4 8656; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8657; AVX512-NEXT: vmovdqa64 %ymm27, %ymm4 8658; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 8659; AVX512-NEXT: vmovdqa64 %ymm17, %ymm4 8660; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 8661; AVX512-NEXT: vpor %ymm3, %ymm2, %ymm2 8662; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8663; AVX512-NEXT: vmovdqa (%rsi), %xmm3 8664; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm2 8665; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 8666; AVX512-NEXT: vmovdqa (%rdi), %xmm4 8667; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm3 8668; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27 8669; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 8670; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8671; AVX512-NEXT: vmovdqa (%rcx), %xmm3 8672; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm2 8673; AVX512-NEXT: vmovdqa64 %xmm3, %xmm17 8674; AVX512-NEXT: vmovdqa (%rdx), %xmm13 8675; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3 8676; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3 8677; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 8678; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8679; AVX512-NEXT: vmovdqa (%r9), %xmm11 8680; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 8681; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm2 8682; AVX512-NEXT: vmovdqa (%r8), %xmm10 8683; AVX512-NEXT: vmovdqa64 %xmm29, %xmm3 8684; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm3 8685; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 8686; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8687; AVX512-NEXT: vmovdqa64 %ymm20, %ymm4 8688; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25] 8689; AVX512-NEXT: vmovdqa64 %ymm22, %ymm3 8690; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 8691; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8692; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8693; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero 8694; AVX512-NEXT: vpshufb %ymm8, %ymm4, %ymm2 8695; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm29 8696; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2 8697; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero 8698; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3 8699; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 8700; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm26 8701; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] 8702; AVX512-NEXT: vpshufb %ymm7, %ymm12, %ymm2 8703; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm23 8704; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 8705; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 8706; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 8707; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8708; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8709; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8710; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] 8711; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm1 8712; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8713; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8714; AVX512-NEXT: vmovdqa (%rax), %ymm8 8715; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 8716; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] 8717; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 8718; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 8719; AVX512-NEXT: vmovdqa (%rax), %xmm5 8720; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6] 8721; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] 8722; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 8723; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 8724; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] 8725; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm3 8726; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 8727; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4 8728; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] 8729; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 8730; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3 8731; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 8732; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8733; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1] 8734; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8735; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8736; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 8737; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 8738; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3 8739; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21 8740; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8741; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1] 8742; AVX512-NEXT: vmovdqa64 %xmm18, %xmm2 8743; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] 8744; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 8745; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm12 8746; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 8747; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8748; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1] 8749; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 
8750; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18 8751; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] 8752; AVX512-NEXT: vmovdqa64 %xmm17, %xmm3 8753; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] 8754; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 8755; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm12 8756; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8757; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 8758; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm8 8759; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] 8760; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 8761; AVX512-NEXT: vmovdqa64 %xmm27, %xmm7 8762; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] 8763; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 8764; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm15 8765; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 8766; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 8767; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 8768; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] 8769; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] 8770; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 8771; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm14 8772; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 8773; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm31, %zmm31 8774; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 8775; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm1 8776; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 8777; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] 8778; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 8779; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 8780; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm9 8781; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 8782; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm6 ^ (zmm1 & (zmm22 ^ zmm6)) 8783; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 8784; AVX512-NEXT: # ymm6 = mem[2,3,2,3] 8785; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8786; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 8787; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload 8788; AVX512-NEXT: # ymm27 = mem[2,3,2,3] 8789; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 8790; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 8791; AVX512-NEXT: vpternlogq {{.*#+}} 
zmm27 = zmm6 ^ (zmm1 & (zmm27 ^ zmm6)) 8792; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 8793; AVX512-NEXT: vmovdqa64 %xmm24, %xmm0 8794; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 8795; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 8796; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] 8797; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0 8798; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm6 8799; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload 8800; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 8801; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 8802; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm10 8803; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload 8804; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 8805; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7] 8806; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 8807; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 8808; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 8809; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload 8810; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 8811; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload 8812; AVX512-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] 8813; AVX512-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7] 8814; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11)) 8815; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 8816; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload 8817; AVX512-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] 8818; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] 8819; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm21 & (zmm11 ^ zmm13)) 8820; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 8821; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5] 8822; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm21 & (zmm7 ^ zmm8)) 8823; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1] 8824; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] 8825; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] 8826; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3] 8827; AVX512-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 8828; AVX512-NEXT: # xmm15 = mem[1,1,0,0,4,5,6,7] 8829; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] 8830; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload 8831; AVX512-NEXT: vporq %zmm26, %zmm23, %zmm17 8832; AVX512-NEXT: vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7] 8833; AVX512-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7] 8834; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm18 ^ (zmm21 & (zmm17 ^ zmm18)) 8835; 
AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload 8836; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload 8837; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm8)) 8838; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload 8839; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm8 & mem) 8840; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12)) 8841; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload 8842; AVX512-NEXT: # zmm8 = mem[2,3,2,3,6,7,6,7] 8843; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm11)) 8844; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0] 8845; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 8846; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm30)) 8847; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm22)) 8848; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5] 8849; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm28 ^ (mem & (zmm0 ^ zmm28)) 8850; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7)) 8851; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8852; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload 8853; AVX512-NEXT: # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3] 8854; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm3 & mem) 8855; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm27)) 8856; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 8857; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5] 8858; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) 8859; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5] 8860; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4] 8861; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm1)) 8862; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3)) 8863; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 8864; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload 8865; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 8866; AVX512-NEXT: vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1)) 8867; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17)) 8868; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 8869; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rax) 8870; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) 8871; AVX512-NEXT: vmovdqa64 %zmm9, 320(%rax) 8872; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rax) 8873; AVX512-NEXT: vmovdqa64 %zmm11, 192(%rax) 8874; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rax) 8875; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rax) 8876; AVX512-NEXT: addq $1384, %rsp # imm = 0x568 8877; AVX512-NEXT: vzeroupper 8878; AVX512-NEXT: retq 8879; 8880; AVX512-FCP-LABEL: store_i8_stride7_vf64: 8881; AVX512-FCP: # %bb.0: 8882; AVX512-FCP-NEXT: subq $1432, %rsp # imm = 0x598 8883; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 8884; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero 8885; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 8886; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8887; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] 8888; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 8889; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 8890; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 8891; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8892; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8893; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29] 8894; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] 8895; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] 8896; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 8897; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 8898; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 8899; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8900; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8901; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 8902; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero 8903; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 8904; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] 8905; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] 8906; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm1 8907; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 8908; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8909; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8910; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u] 8911; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] 8912; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] 8913; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 8914; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31 8915; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 8916; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8917; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8918; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 8919; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] 8920; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 8921; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] 8922; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 8923; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm1 8924; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 8925; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8926; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8927; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero 8928; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 8929; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] 8930; 
AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] 8931; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 8932; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 8933; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 8934; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8935; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8936; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8937; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 8938; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8939; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 8940; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 8941; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8942; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8943; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 8944; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] 8945; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 8946; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero 8947; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 8948; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8949; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3 8950; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero 8951; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 8952; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31] 8953; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 8954; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8955; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 8956; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero 8957; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm14 8958; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u] 8959; AVX512-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 8960; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8961; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 8962; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8963; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 8964; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8965; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 8966; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm7 8967; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 8968; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9 8969; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm0 8970; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8971; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 8972; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 8973; AVX512-FCP-NEXT: vmovdqa %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8974; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 8975; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm9 8976; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 8977; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 8978; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 8979; AVX512-FCP-NEXT: vpor %xmm9, %xmm11, %xmm0 8980; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8981; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9 8982; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8983; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm0 8984; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8985; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 8986; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 8987; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 8988; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm12 8989; AVX512-FCP-NEXT: vpor %xmm9, %xmm12, %xmm0 8990; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8991; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 8992; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 8993; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 8994; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 8995; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 8996; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 8997; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm0 8998; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8999; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm10 9000; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 9001; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9 9002; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8 9003; AVX512-FCP-NEXT: vpor %xmm4, %xmm8, %xmm0 9004; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9005; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm8 9006; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 9007; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm7 9008; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm13 9009; AVX512-FCP-NEXT: vporq %xmm11, %xmm13, %xmm30 9010; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 9011; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 9012; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] 9013; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] 9014; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm13 9015; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9016; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9017; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 9018; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm11 9019; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] 9020; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9021; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm13 9022; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 9023; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9024; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9025; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 9026; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm11 9027; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 9028; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9029; 
AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm13 9030; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 9031; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9032; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9033; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 9034; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm11 9035; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] 9036; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 9037; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm13 9038; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm25 9039; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 9040; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm11 9041; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 9042; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9043; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm13 9044; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 9045; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9046; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9047; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 9048; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm11 9049; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] 9050; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9051; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm13 9052; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 9053; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9054; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9055; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero 9056; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29] 9057; AVX512-FCP-NEXT: vporq %ymm6, %ymm5, %ymm20 9058; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero 9059; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u] 9060; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm0 9061; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9062; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] 9063; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero 9064; AVX512-FCP-NEXT: vporq %ymm1, %ymm0, %ymm21 9065; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero 9066; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 9067; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31] 9068; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9069; AVX512-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9070; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 9071; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 9072; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 9073; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9074; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9075; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 9076; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] 9077; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9078; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero 9079; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9080; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9081; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm0 9082; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 9083; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 9084; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm29 9085; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 9086; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 9087; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 9088; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 9089; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 9090; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 9091; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9092; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9093; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero 9094; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] 9095; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9096; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9097; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm1 9098; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] 9099; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] 9100; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6] 9101; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm23 9102; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 9103; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 9104; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm15 9105; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero 9106; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 9107; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 9108; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 9109; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 9110; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 9111; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9112; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload 9113; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm1 9114; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 9115; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 9116; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9117; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm19 9118; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm4 9119; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1 9120; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 9121; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 9122; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9123; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 9124; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9125; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm5 9126; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] 9127; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9128; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9129; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm17 9130; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm6 9131; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6] 9132; AVX512-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm31 9133; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] 9134; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 9135; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm0 9136; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 9137; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9138; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9139; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 9140; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 9141; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 9142; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm3, %zmm28 9143; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 9144; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 9145; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 9146; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload 9147; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 9148; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 9149; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 9150; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 9151; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 9152; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 9153; AVX512-FCP-NEXT: vpshufb 
%xmm11, %xmm7, %xmm7 9154; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm30, %zmm7, %zmm7 9155; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] 9156; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 9157; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1] 9158; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 9159; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload 9160; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 9161; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload 9162; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] 9163; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] 9164; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm8)) 9165; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 9166; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload 9167; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7] 9168; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] 9169; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm8 & (zmm30 ^ zmm20)) 9170; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5] 9171; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5] 9172; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm5 ^ (zmm8 & (zmm20 ^ zmm5)) 9173; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9174; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload 9175; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload 9176; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7] 9177; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7] 9178; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm25 ^ (zmm8 & (zmm1 ^ zmm25)) 9179; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm4 9180; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 9181; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 9182; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 9183; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1] 9184; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 9185; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 9186; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 9187; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm14 ^ (zmm8 & (zmm10 ^ zmm14)) 9188; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1] 9189; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 9190; AVX512-FCP-NEXT: # ymm11 = mem[2,3,2,3] 9191; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9192; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 9193; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3] 9194; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9195; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 9196; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm11 ^ (zmm8 & (zmm14 ^ zmm11)) 9197; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 9198; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] 9199; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] 9200; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm16 9201; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 9202; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm12 9203; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 9204; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7] 9205; AVX512-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm2 9206; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 9207; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 9208; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 9209; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5] 9210; AVX512-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15 9211; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9212; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero 9213; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 9214; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 9215; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1] 9216; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1] 9217; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1] 9218; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] 9219; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0] 9220; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 9221; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 9222; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 9223; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 9224; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload 9225; AVX512-FCP-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] 9226; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm30)) 9227; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload 9228; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload 9229; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm6)) 9230; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload 9231; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm3 & mem) 9232; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17)) 9233; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm5, %zmm3 9234; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5] 9235; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3)) 9236; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm20)) 9237; AVX512-FCP-NEXT: 
vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5] 9238; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5] 9239; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3)) 9240; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm3 9241; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5] 9242; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (mem & (zmm4 ^ zmm3)) 9243; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6)) 9244; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 9245; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload 9246; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm6 9247; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] 9248; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3)) 9249; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm1)) 9250; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 9251; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21)) 9252; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm10)) 9253; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9254; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload 9255; AVX512-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3] 9256; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm1 & mem) 9257; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm14)) 9258; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9259; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 320(%rax) 9260; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) 9261; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 9262; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 9263; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) 9264; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) 9265; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) 9266; AVX512-FCP-NEXT: addq $1432, %rsp # imm = 0x598 9267; AVX512-FCP-NEXT: vzeroupper 9268; AVX512-FCP-NEXT: retq 9269; 9270; AVX512DQ-LABEL: store_i8_stride7_vf64: 9271; AVX512DQ: # %bb.0: 9272; AVX512DQ-NEXT: subq $1384, %rsp # imm = 0x568 9273; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm7 9274; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] 9275; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 9276; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero 9277; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20 9278; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 9279; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9280; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm15 9281; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 9282; AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm0 9283; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm23 9284; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 9285; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 9286; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm1 9287; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26 9288; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm18 9289; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 9290; AVX512DQ-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9291; AVX512DQ-NEXT: vmovdqa (%r8), %ymm14 9292; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] 9293; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm0 9294; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm27 9295; AVX512DQ-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9296; AVX512DQ-NEXT: vmovdqa (%r9), %ymm8 9297; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] 9298; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] 9299; AVX512DQ-NEXT: vpshufb %ymm3, %ymm8, %ymm1 9300; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm17 9301; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 9302; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9303; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10 9304; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] 9305; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] 9306; AVX512DQ-NEXT: vpshufb %ymm6, %ymm10, %ymm0 9307; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9 9308; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero,ymm9[25] 9309; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9310; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9311; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] 9312; AVX512DQ-NEXT: vpshufb %ymm1, %ymm9, %ymm0 9313; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 9314; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero 9315; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9316; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9317; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm5 9318; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] 9319; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] 9320; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm2 9321; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4 9322; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero 9323; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 9324; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9325; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] 9326; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] 9327; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm2 9328; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21 9329; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] 9330; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 9331; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9332; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm3 9333; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = 
[128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] 9334; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] 9335; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm11 9336; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm25 9337; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm2 9338; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 9339; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 9340; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9341; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] 9342; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] 9343; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm12 9344; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31] 9345; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 9346; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9347; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 9348; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm0 9349; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9350; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 9351; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 9352; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 9353; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9354; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm6 9355; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm22 9356; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7 9357; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0 9358; AVX512DQ-NEXT: vpshufb %ymm0, %ymm7, %ymm7 9359; AVX512DQ-NEXT: vpor %ymm6, %ymm7, %ymm0 9360; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9361; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm12 9362; AVX512DQ-NEXT: vpshufb %ymm1, %ymm12, %ymm6 9363; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0 9364; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm7 9365; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm19 9366; AVX512DQ-NEXT: vpor %ymm6, %ymm7, %ymm0 9367; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9368; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0 9369; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm6 9370; AVX512DQ-NEXT: vpshufb %ymm11, %ymm8, %ymm7 9371; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm16 9372; AVX512DQ-NEXT: vpor %ymm6, %ymm7, %ymm0 9373; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9374; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm0 9375; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9376; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm1 9377; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9378; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 9379; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm6 9380; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 9381; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm7 9382; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21 9383; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm0 9384; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9385; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 9386; AVX512DQ-NEXT: vmovdqa 32(%rsi), 
%xmm15 9387; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 9388; AVX512DQ-NEXT: vpshufb %xmm6, %xmm15, %xmm7 9389; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 9390; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm8 9391; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm30 9392; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm0 9393; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9394; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] 9395; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm0 9396; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9397; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] 9398; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 9399; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm28 9400; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm0 9401; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm14 9402; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 9403; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm7 9404; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24 9405; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm18 9406; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 9407; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm8 9408; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm29 9409; AVX512DQ-NEXT: vporq %xmm7, %xmm8, %xmm31 9410; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0 9411; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm7 9412; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 9413; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm8 9414; AVX512DQ-NEXT: vpor %ymm7, %ymm8, %ymm0 9415; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9416; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 9417; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] 9418; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm0 9419; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23 9420; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] 9421; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] 9422; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm1 9423; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 9424; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9425; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] 9426; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero 9427; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 9428; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9429; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] 9430; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] 9431; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm1 9432; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] 9433; AVX512DQ-NEXT: vpshufb %ymm8, %ymm9, %ymm4 9434; AVX512DQ-NEXT: vpor %ymm1, %ymm4, %ymm1 9435; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9436; AVX512DQ-NEXT: vbroadcasti128 
{{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 9437; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] 9438; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm4 9439; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] 9440; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] 9441; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm5 9442; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4 9443; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9444; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm4 9445; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 9446; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm4 9447; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 9448; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 9449; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9450; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 9451; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm2 9452; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 9453; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 9454; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm3 9455; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm27 9456; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 9457; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9458; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 9459; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm2 9460; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm17 9461; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13 9462; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 9463; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm3 9464; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 9465; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9466; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 9467; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 9468; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm2 9469; AVX512DQ-NEXT: vmovdqa (%r8), %xmm10 9470; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm3 9471; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm3 9472; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 9473; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9474; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm4 9475; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25] 9476; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm3 9477; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 9478; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 9479; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9480; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero 9481; AVX512DQ-NEXT: vpshufb %ymm8, %ymm4, %ymm2 9482; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm29 9483; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 9484; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero 9485; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm3 9486; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 9487; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm26 9488; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] 9489; AVX512DQ-NEXT: vpshufb %ymm7, %ymm12, %ymm2 9490; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm23 9491; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2 9492; 
AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero 9493; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 9494; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9495; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9496; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9497; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] 9498; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm1 9499; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9500; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9501; AVX512DQ-NEXT: vmovdqa (%rax), %ymm8 9502; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 9503; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] 9504; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 9505; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 9506; AVX512DQ-NEXT: vmovdqa (%rax), %xmm5 9507; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6] 9508; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] 9509; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] 9510; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 9511; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] 9512; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm3 9513; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 9514; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4 9515; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] 9516; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 9517; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 9518; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 9519; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9520; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1] 9521; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9522; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9523; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 9524; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 9525; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 9526; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21 9527; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9528; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1] 9529; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2 9530; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] 9531; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 9532; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm12 9533; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 9534; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9535; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1] 9536; 
AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 9537; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18 9538; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] 9539; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm3 9540; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] 9541; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 9542; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm12 9543; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 9544; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9545; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm8 9546; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] 9547; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 9548; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm7 9549; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] 9550; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 9551; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm15 9552; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 9553; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9554; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 9555; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] 9556; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] 9557; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 9558; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm14 9559; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 9560; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm31, %zmm31 9561; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 9562; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm1 9563; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 9564; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] 9565; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] 9566; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 9567; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm9 9568; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 9569; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm6 ^ (zmm1 & (zmm22 ^ zmm6)) 9570; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 9571; AVX512DQ-NEXT: # ymm6 = mem[2,3,2,3] 9572; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9573; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 9574; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload 9575; 
AVX512DQ-NEXT: # ymm27 = mem[2,3,2,3] 9576; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9577; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 9578; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm6 ^ (zmm1 & (zmm27 ^ zmm6)) 9579; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 9580; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0 9581; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 9582; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 9583; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] 9584; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm0 9585; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6 9586; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload 9587; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 9588; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm0 9589; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm10 9590; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload 9591; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 9592; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7] 9593; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 9594; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 9595; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 9596; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload 9597; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 9598; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload 9599; AVX512DQ-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] 9600; AVX512DQ-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7] 9601; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11)) 9602; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 9603; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload 9604; AVX512DQ-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] 9605; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] 9606; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm21 & (zmm11 ^ zmm13)) 9607; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] 9608; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5] 9609; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm21 & (zmm7 ^ zmm8)) 9610; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1] 9611; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] 9612; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] 9613; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3] 9614; AVX512DQ-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 9615; AVX512DQ-NEXT: # xmm15 = mem[1,1,0,0,4,5,6,7] 9616; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] 9617; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 
64-byte Folded Reload 9618; AVX512DQ-NEXT: vporq %zmm26, %zmm23, %zmm17 9619; AVX512DQ-NEXT: vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7] 9620; AVX512DQ-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7] 9621; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm18 ^ (zmm21 & (zmm17 ^ zmm18)) 9622; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload 9623; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload 9624; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm8)) 9625; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload 9626; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm8 & mem) 9627; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12)) 9628; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload 9629; AVX512DQ-NEXT: # zmm8 = mem[2,3,2,3,6,7,6,7] 9630; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm11)) 9631; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0] 9632; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 9633; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm30)) 9634; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm22)) 9635; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5] 9636; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm28 ^ (mem & (zmm0 ^ zmm28)) 9637; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7)) 9638; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 9639; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload 9640; AVX512DQ-NEXT: # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3] 9641; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm3 & mem) 9642; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm27)) 9643; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 9644; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5] 9645; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) 9646; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5] 9647; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4] 9648; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm1)) 9649; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3)) 9650; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9651; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload 9652; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 9653; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1)) 9654; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17)) 9655; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 9656; AVX512DQ-NEXT: vmovdqa64 %zmm16, 128(%rax) 9657; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) 9658; AVX512DQ-NEXT: vmovdqa64 %zmm9, 320(%rax) 9659; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) 9660; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rax) 9661; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rax) 9662; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rax) 9663; AVX512DQ-NEXT: addq $1384, %rsp # imm = 0x568 9664; AVX512DQ-NEXT: vzeroupper 9665; AVX512DQ-NEXT: retq 9666; 9667; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64: 9668; AVX512DQ-FCP: # %bb.0: 9669; AVX512DQ-FCP-NEXT: subq $1432, %rsp # imm = 0x598 9670; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 9671; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero 9672; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 9673; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9674; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] 9675; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 9676; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 9677; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 9678; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9679; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9680; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29] 9681; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] 9682; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] 9683; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 9684; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 9685; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 9686; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9687; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9688; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 9689; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero 9690; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 9691; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] 9692; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] 9693; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm1 9694; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 9695; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9696; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9697; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u] 9698; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] 9699; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] 9700; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 9701; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31 9702; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 9703; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9704; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9705; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 9706; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] 9707; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm3 9708; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] 9709; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 9710; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm1 9711; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 9712; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9713; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9714; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero 9715; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 9716; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] 9717; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] 9718; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 9719; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 9720; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 9721; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9722; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9723; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9724; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 9725; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9726; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 9727; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 9728; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 9729; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9730; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 9731; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] 9732; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 9733; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero 9734; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9735; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9736; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3 9737; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero 9738; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 9739; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31] 9740; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9741; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9742; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 9743; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero 9744; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm14 9745; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u] 9746; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 9747; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9748; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 9749; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9750; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 9751; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9752; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = 
[u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 9753; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm7 9754; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 9755; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9 9756; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm9, %xmm0 9757; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9758; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 9759; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 9760; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9761; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 9762; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm9 9763; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 9764; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 9765; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 9766; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm11, %xmm0 9767; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9768; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm9 9769; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9770; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm0 9771; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9772; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 9773; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 9774; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 9775; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm12 9776; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm0 9777; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9778; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 9779; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 9780; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 9781; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 9782; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 9783; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 9784; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm0 9785; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9786; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm10 9787; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 9788; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm9 9789; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8 9790; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm8, %xmm0 9791; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9792; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm8 9793; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 9794; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm7 9795; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm13 9796; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm13, %xmm30 9797; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 9798; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 9799; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] 9800; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] 9801; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm13 9802; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9803; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9804; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 9805; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm11 9806; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] 9807; 
AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9808; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm13 9809; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 9810; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9811; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9812; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 9813; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm11 9814; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] 9815; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9816; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm13 9817; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 9818; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9819; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9820; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 9821; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm11 9822; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] 9823; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 9824; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm13 9825; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm25 9826; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 9827; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm11 9828; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] 9829; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9830; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm13 9831; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 9832; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9833; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9834; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 9835; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm11 9836; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] 9837; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] 9838; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm13 9839; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 9840; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm0 9841; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9842; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero 9843; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29] 9844; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm5, %ymm20 9845; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero 9846; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u] 9847; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm0 9848; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9849; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] 9850; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 
= ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero 9851; AVX512DQ-FCP-NEXT: vporq %ymm1, %ymm0, %ymm21 9852; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero 9853; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 9854; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31] 9855; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9856; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9857; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 9858; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 9859; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 9860; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9861; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9862; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 9863; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] 9864; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9865; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero 9866; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9867; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9868; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm0 9869; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 9870; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 9871; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm29 9872; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 9873; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 9874; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 9875; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 9876; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 9877; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 9878; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9879; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9880; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero 9881; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] 9882; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 9883; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9884; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm1 9885; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] 9886; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] 9887; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6] 9888; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm23 9889; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 9890; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 9891; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm15 9892; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero 9893; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 9894; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 9895; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 9896; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 9897; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 9898; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9899; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload 9900; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm1 9901; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 9902; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 9903; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9904; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm19 9905; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm4 9906; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1 9907; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 9908; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 9909; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9910; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 9911; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9912; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm5 9913; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] 9914; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9915; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9916; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm17 9917; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm6 9918; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6] 9919; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm31 9920; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] 9921; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 9922; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm0 9923; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 9924; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9925; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9926; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 9927; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 9928; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 9929; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm3, %zmm28 9930; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 9931; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 9932; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 9933; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload 9934; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 9935; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 9936; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 9937; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload 9938; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 9939; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 9940; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 9941; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm30, %zmm7, %zmm7 9942; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] 9943; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 9944; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1] 9945; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 9946; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload 9947; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload 9948; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload 9949; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] 9950; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] 9951; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm8)) 9952; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 9953; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload 9954; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7] 9955; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] 9956; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm8 & (zmm30 ^ zmm20)) 9957; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5] 9958; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5] 9959; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm5 ^ (zmm8 & (zmm20 ^ zmm5)) 9960; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 9961; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload 9962; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload 9963; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7] 9964; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7] 9965; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm25 ^ (zmm8 & (zmm1 ^ zmm25)) 9966; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm4 9967; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 9968; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 9969; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 9970; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1] 9971; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 9972; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 9973; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] 9974; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm14 ^ (zmm8 & (zmm10 ^ zmm14)) 9975; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1] 9976; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload 9977; AVX512DQ-FCP-NEXT: # ymm11 = mem[2,3,2,3] 9978; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9979; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 9980; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3] 9981; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 9982; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 9983; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm11 ^ (zmm8 & (zmm14 ^ zmm11)) 9984; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 9985; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] 9986; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] 9987; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm16 9988; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] 9989; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm12 9990; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 9991; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7] 9992; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm2 9993; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] 9994; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] 9995; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 9996; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5] 9997; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15 9998; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9999; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero 10000; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] 10001; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 10002; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1] 10003; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1] 10004; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1] 10005; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] 10006; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0] 10007; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] 10008; AVX512DQ-FCP-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 10009; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 10010; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 10011; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload 10012; AVX512DQ-FCP-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] 10013; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm30)) 10014; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload 10015; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload 10016; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm6)) 10017; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload 10018; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm3 & mem) 10019; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17)) 10020; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm5, %zmm3 10021; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5] 10022; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3)) 10023; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm20)) 10024; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5] 10025; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5] 10026; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3)) 10027; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm3 10028; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5] 10029; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (mem & (zmm4 ^ zmm3)) 10030; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6)) 10031; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 10032; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload 10033; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm6 10034; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] 10035; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3)) 10036; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm1)) 10037; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 10038; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21)) 10039; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm10)) 10040; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 10041; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload 10042; AVX512DQ-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3] 10043; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm1 & mem) 10044; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm14)) 10045; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10046; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 320(%rax) 10047; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) 10048; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) 10049; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 10050; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) 10051; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) 10052; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) 10053; AVX512DQ-FCP-NEXT: addq $1432, %rsp # imm = 0x598 10054; AVX512DQ-FCP-NEXT: vzeroupper 10055; AVX512DQ-FCP-NEXT: retq 10056; 10057; AVX512BW-LABEL: store_i8_stride7_vf64: 10058; AVX512BW: # %bb.0: 
10059; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10060; AVX512BW-NEXT: vmovdqa (%rax), %ymm13 10061; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 10062; AVX512BW-NEXT: vpshufb %ymm26, %ymm13, %ymm0 10063; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 10064; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] 10065; AVX512BW-NEXT: vpermw %ymm13, %ymm1, %ymm1 10066; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 10067; AVX512BW-NEXT: vmovdqa (%r9), %ymm9 10068; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 10069; AVX512BW-NEXT: vpshufb %ymm17, %ymm9, %ymm1 10070; AVX512BW-NEXT: vmovdqa (%r8), %ymm10 10071; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] 10072; AVX512BW-NEXT: vpshufb %ymm21, %ymm10, %ymm2 10073; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm2 10074; AVX512BW-NEXT: vmovdqa (%r9), %xmm1 10075; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10076; AVX512BW-NEXT: vmovdqa (%r8), %xmm12 10077; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 10078; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 10079; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 10080; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm8 10081; AVX512BW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 10082; AVX512BW-NEXT: kmovq %r10, %k1 10083; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm8 {%k1} 10084; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 10085; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 10086; AVX512BW-NEXT: vpshufb %ymm0, %ymm14, %ymm2 10087; AVX512BW-NEXT: vmovdqa (%rcx), %ymm15 10088; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 10089; AVX512BW-NEXT: vpshufb %ymm20, %ymm15, %ymm4 10090; AVX512BW-NEXT: vpor %ymm2, %ymm4, %ymm2 10091; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 10092; AVX512BW-NEXT: vmovdqa (%rcx), %xmm5 10093; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 10094; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 10095; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 10096; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm22 10097; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm18 10098; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] 10099; AVX512BW-NEXT: vpshufb %ymm24, %ymm18, %ymm2 10100; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm19 10101; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 10102; AVX512BW-NEXT: vpshufb %ymm25, %ymm19, %ymm6 10103; AVX512BW-NEXT: vpor %ymm2, %ymm6, %ymm2 10104; AVX512BW-NEXT: vmovdqa (%rdi), %xmm6 10105; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 10106; 
AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 10107; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm23[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 10108; AVX512BW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,0,1] 10109; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm23, %zmm3 10110; AVX512BW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 10111; AVX512BW-NEXT: kmovq %r10, %k1 10112; AVX512BW-NEXT: vmovdqu8 %zmm22, %zmm3 {%k1} 10113; AVX512BW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 10114; AVX512BW-NEXT: kmovq %r10, %k1 10115; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm3 {%k1} 10116; AVX512BW-NEXT: vmovdqa64 32(%rdx), %ymm29 10117; AVX512BW-NEXT: vpshufb %ymm0, %ymm29, %ymm0 10118; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm30 10119; AVX512BW-NEXT: vpshufb %ymm20, %ymm30, %ymm8 10120; AVX512BW-NEXT: vpor %ymm0, %ymm8, %ymm0 10121; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] 10122; AVX512BW-NEXT: vpshufb %ymm20, %ymm29, %ymm8 10123; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] 10124; AVX512BW-NEXT: vpshufb %ymm22, %ymm30, %ymm23 10125; AVX512BW-NEXT: vporq %ymm8, %ymm23, %ymm8 10126; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 10127; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 10128; AVX512BW-NEXT: vmovdqa64 32(%rsi), %ymm28 10129; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm16 10130; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm16[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 10131; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5] 10132; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm23 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] 10133; AVX512BW-NEXT: movl $676341840, %r10d # imm = 0x28502850 10134; AVX512BW-NEXT: kmovd %r10d, %k1 10135; AVX512BW-NEXT: vpshufb %ymm23, %ymm28, %ymm8 {%k1} 10136; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 10137; AVX512BW-NEXT: vpshufb %ymm24, %ymm16, %ymm24 10138; AVX512BW-NEXT: vpshufb %ymm25, %ymm28, %ymm25 10139; AVX512BW-NEXT: vporq %ymm24, %ymm25, %ymm24 10140; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm24, %zmm8 10141; AVX512BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 10142; AVX512BW-NEXT: kmovq %r10, %k2 10143; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2} 10144; AVX512BW-NEXT: vmovdqa64 32(%r9), %ymm31 10145; AVX512BW-NEXT: vpshufb %ymm17, %ymm31, %ymm17 10146; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm1 10147; AVX512BW-NEXT: vpshufb %ymm21, %ymm1, %ymm21 10148; AVX512BW-NEXT: vporq %ymm17, %ymm21, %ymm17 10149; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] 10150; AVX512BW-NEXT: vpshufb %ymm24, %ymm1, %ymm21 10151; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] 10152; AVX512BW-NEXT: vpshufb %ymm25, %ymm31, %ymm27 10153; AVX512BW-NEXT: vporq %ymm21, %ymm27, %ymm21 10154; AVX512BW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] 10155; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm17, %zmm21 10156; AVX512BW-NEXT: vmovdqa64 32(%rax), %ymm17 10157; AVX512BW-NEXT: vpshufb %ymm26, %ymm17, %ymm27 10158; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 10159; AVX512BW-NEXT: vpermw %ymm17, %ymm26, %ymm11 10160; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 10161; AVX512BW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 10162; AVX512BW-NEXT: kmovq %r10, %k3 10163; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm21 {%k3} 10164; AVX512BW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 10165; AVX512BW-NEXT: kmovq %r10, %k3 10166; AVX512BW-NEXT: vmovdqu8 %zmm21, %zmm8 {%k3} 10167; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm16[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 10168; AVX512BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] 10169; AVX512BW-NEXT: movl $338170920, %r10d # imm = 0x14281428 10170; AVX512BW-NEXT: kmovd %r10d, %k4 10171; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm27 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] 10172; AVX512BW-NEXT: vpshufb %ymm27, %ymm28, %ymm11 {%k4} 10173; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] 10174; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 10175; AVX512BW-NEXT: vpshufb %ymm2, %ymm28, %ymm21 10176; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] 10177; AVX512BW-NEXT: vpshufb %ymm28, %ymm16, %ymm16 10178; AVX512BW-NEXT: vporq %ymm21, %ymm16, %ymm16 10179; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm16 10180; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm29[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 10181; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm11[0,2,3,3,4,6,7,7] 10182; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] 10183; AVX512BW-NEXT: vpshufb %ymm11, %ymm30, %ymm0 10184; AVX512BW-NEXT: vmovdqu8 %ymm21, %ymm0 {%k1} 10185; AVX512BW-NEXT: vpshufb {{.*#+}} ymm21 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm29[24,25],zero,ymm29[23],zero,ymm29[21,22,23,26],zero,ymm29[24],zero,ymm29[28,29,26,27] 10186; AVX512BW-NEXT: vpshufb {{.*#+}} ymm29 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero,zero,zero 10187; AVX512BW-NEXT: vporq %ymm21, %ymm29, %ymm21 10188; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 10189; AVX512BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] 10190; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] 10191; AVX512BW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 10192; AVX512BW-NEXT: kmovq %r10, %k3 10193; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k3} 10194; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 10195; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7] 10196; 
AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero 10197; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm29 10198; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm31[0,1,2,3],zmm29[4,5,6,7] 10199; AVX512BW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61] 10200; AVX512BW-NEXT: vporq %zmm1, %zmm21, %zmm1 10201; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10202; AVX512BW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 10203; AVX512BW-NEXT: kmovq %r10, %k3 10204; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k3} 10205; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm21 10206; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] 10207; AVX512BW-NEXT: vpermi2w %zmm21, %zmm17, %zmm1 10208; AVX512BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 10209; AVX512BW-NEXT: kmovq %rax, %k5 10210; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k5} 10211; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 10212; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] 10213; AVX512BW-NEXT: vpshufb %ymm23, %ymm19, %ymm1 {%k1} 10214; AVX512BW-NEXT: vpshufb %ymm2, %ymm19, %ymm2 10215; AVX512BW-NEXT: vpshufb %ymm28, %ymm18, %ymm23 10216; AVX512BW-NEXT: vporq %ymm2, %ymm23, %ymm2 10217; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 10218; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 10219; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 10220; AVX512BW-NEXT: vpshufb %zmm20, %zmm2, %zmm2 10221; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm20 10222; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm20, %zmm20 10223; AVX512BW-NEXT: vpshufb %zmm22, %zmm20, %zmm20 10224; AVX512BW-NEXT: vporq %zmm2, %zmm20, %zmm2 10225; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10226; AVX512BW-NEXT: vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7] 10227; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm20 {%k3} 10228; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm22 10229; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm29, %zmm1 10230; AVX512BW-NEXT: vpshufb %zmm24, %zmm1, %zmm1 10231; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm23 10232; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 10233; AVX512BW-NEXT: vpshufb %zmm25, %zmm0, %zmm2 10234; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm0 10235; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1 10236; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm2 10237; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10238; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm21, %zmm13 10239; AVX512BW-NEXT: vpermw %zmm13, %zmm26, %zmm24 10240; AVX512BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 10241; AVX512BW-NEXT: kmovq %rax, %k5 10242; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm1 {%k5} 10243; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm24 10244; AVX512BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C 10245; AVX512BW-NEXT: kmovq %rax, %k5 10246; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm20 {%k5} 10247; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm1 10248; AVX512BW-NEXT: vpshufhw {{.*#+}} 
ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 10249; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm18[2,2,3,3,6,6,7,7] 10250; AVX512BW-NEXT: vpshufb %ymm27, %ymm19, %ymm25 {%k4} 10251; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm24[0],xmm1[0],xmm24[1],xmm1[1],xmm24[2],xmm1[2],xmm24[3],xmm1[3],xmm24[4],xmm1[4],xmm24[5],xmm1[5],xmm24[6],xmm1[6],xmm24[7],xmm1[7] 10252; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 10253; AVX512BW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 10254; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm25[2,3,2,3],zmm19[0,1,0,1] 10255; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 10256; AVX512BW-NEXT: vpshufb %ymm11, %ymm15, %ymm11 10257; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 10258; AVX512BW-NEXT: vpshufb %xmm19, %xmm26, %xmm15 10259; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 10260; AVX512BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,3,3,4,6,7,7] 10261; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k1} 10262; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm11[2,3,2,3],zmm15[0,1,0,1] 10263; AVX512BW-NEXT: vmovdqu8 %zmm25, %zmm14 {%k2} 10264; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] 10265; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 10266; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm11 10267; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero 10268; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29] 10269; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9 10270; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[2,3,2,3],zmm11[0,1,0,1] 10271; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] 10272; AVX512BW-NEXT: vpermw %zmm21, %zmm10, %zmm10 10273; AVX512BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 10274; AVX512BW-NEXT: kmovq %rax, %k1 10275; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k1} 10276; AVX512BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E 10277; AVX512BW-NEXT: kmovq %rax, %k1 10278; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm14 {%k1} 10279; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 10280; AVX512BW-NEXT: vpshufb %xmm10, %xmm2, %xmm9 10281; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 10282; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm25 10283; AVX512BW-NEXT: vporq %xmm9, %xmm25, %xmm9 10284; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 10285; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 10286; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm0 10287; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 10288; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm9 10289; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = 
[u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 10290; AVX512BW-NEXT: vpshufb %xmm25, %xmm24, %xmm26 10291; AVX512BW-NEXT: vporq %xmm9, %xmm26, %xmm9 10292; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm24[8],xmm1[9],xmm24[9],xmm1[10],xmm24[10],xmm1[11],xmm24[11],xmm1[12],xmm24[12],xmm1[13],xmm24[13],xmm1[14],xmm24[14],xmm1[15],xmm24[15] 10293; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] 10294; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 10295; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 10296; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,1,0,1,4,5,4,5] 10297; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm9 {%k3} 10298; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 10299; AVX512BW-NEXT: vpshufb %xmm0, %xmm22, %xmm1 10300; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] 10301; AVX512BW-NEXT: vpermi2w %zmm21, %zmm17, %zmm24 10302; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 10303; AVX512BW-NEXT: vpshufb %xmm17, %xmm23, %xmm21 10304; AVX512BW-NEXT: vporq %xmm1, %xmm21, %xmm1 10305; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] 10306; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 10307; AVX512BW-NEXT: vinserti32x4 $2, %xmm21, %zmm1, %zmm1 10308; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 10309; AVX512BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 10310; AVX512BW-NEXT: kmovq %rax, %k1 10311; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm1 {%k1} 10312; AVX512BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 10313; AVX512BW-NEXT: kmovq %rax, %k1 10314; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm9 {%k1} 10315; AVX512BW-NEXT: vpshufb %xmm10, %xmm5, %xmm1 10316; AVX512BW-NEXT: vpshufb %xmm11, %xmm4, %xmm10 10317; AVX512BW-NEXT: vpor %xmm1, %xmm10, %xmm1 10318; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 10319; AVX512BW-NEXT: vpshufb %xmm19, %xmm4, %xmm4 10320; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 10321; AVX512BW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 10322; AVX512BW-NEXT: vpshufb %xmm25, %xmm6, %xmm4 10323; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 10324; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 10325; AVX512BW-NEXT: vpshufb %xmm18, %xmm4, %xmm4 10326; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 10327; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 10328; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] 10329; AVX512BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C 10330; AVX512BW-NEXT: kmovq %rax, %k1 10331; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} 10332; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 10333; AVX512BW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 10334; AVX512BW-NEXT: vpshufb %xmm17, %xmm12, %xmm1 10335; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 10336; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] 10337; 
AVX512BW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 10338; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 10339; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 10340; AVX512BW-NEXT: vpermw %zmm13, %zmm1, %zmm1 10341; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] 10342; AVX512BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 10343; AVX512BW-NEXT: kmovq %rax, %k1 10344; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} 10345; AVX512BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 10346; AVX512BW-NEXT: kmovq %rax, %k1 10347; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} 10348; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10349; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) 10350; AVX512BW-NEXT: vmovdqa64 %zmm8, 320(%rax) 10351; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rax) 10352; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) 10353; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) 10354; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) 10355; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rax) 10356; AVX512BW-NEXT: vzeroupper 10357; AVX512BW-NEXT: retq 10358; 10359; AVX512BW-FCP-LABEL: store_i8_stride7_vf64: 10360; AVX512BW-FCP: # %bb.0: 10361; AVX512BW-FCP-NEXT: subq $104, %rsp 10362; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10363; AVX512BW-FCP-NEXT: vmovdqa (%rax), %ymm2 10364; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 10365; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 10366; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm2, %ymm0 10367; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 10368; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] 10369; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1 10370; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 10371; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 10372; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10373; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 10374; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm3 10375; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm4 10376; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10377; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] 10378; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 10379; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 10380; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm16 10381; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm15 10382; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm16[8],xmm15[9],xmm16[9],xmm15[10],xmm16[10],xmm15[11],xmm16[11],xmm15[12],xmm16[12],xmm15[13],xmm16[13],xmm15[14],xmm16[14],xmm15[15],xmm16[15] 10383; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 10384; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 10385; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 10386; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 10387; AVX512BW-FCP-NEXT: kmovq %r10, %k1 10388; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} 10389; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm4 10390; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10391; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 10392; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 10393; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 10394; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10395; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 10396; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 10397; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 10398; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm17 10399; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm19 10400; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] 10401; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 10402; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm6, %xmm6 10403; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 10404; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 10405; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 10406; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10407; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] 10408; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 10409; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm7 10410; AVX512BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10411; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 10412; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm7, %ymm21 10413; AVX512BW-FCP-NEXT: vporq %ymm6, %ymm21, %ymm6 10414; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm21 10415; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm22 10416; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] 10417; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 10418; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm24 10419; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1] 10420; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm24, %zmm14 10421; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 10422; AVX512BW-FCP-NEXT: kmovq %r10, %k1 10423; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm14 {%k1} 10424; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 10425; AVX512BW-FCP-NEXT: kmovq %r10, %k1 10426; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm14 {%k1} 10427; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 10428; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm4 10429; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 10430; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm5 10431; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 10432; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] 10433; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 10434; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] 10435; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm24 10436; AVX512BW-FCP-NEXT: vporq %ymm5, %ymm24, %ymm5 10437; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 10438; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 10439; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 10440; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 10441; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm25 10442; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm25, %ymm23 10443; AVX512BW-FCP-NEXT: vporq %ymm9, %ymm23, %ymm9 10444; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] 10445; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm23 10446; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] 10447; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm25, %ymm24 10448; AVX512BW-FCP-NEXT: vporq %ymm23, %ymm24, %ymm23 10449; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] 10450; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm9, %zmm9 10451; AVX512BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 10452; AVX512BW-FCP-NEXT: kmovq %r10, %k1 10453; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm9 {%k1} 10454; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm4 10455; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 10456; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm28 10457; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm28, %ymm1 10458; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 10459; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] 10460; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm28, %ymm1 10461; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] 10462; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm23 10463; AVX512BW-FCP-NEXT: vporq %ymm1, %ymm23, %ymm1 10464; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 10465; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 10466; AVX512BW-FCP-NEXT: vmovdqa 32(%rax), %ymm6 10467; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] 10468; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm23, %ymm23 10469; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm6, %ymm20 10470; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 10471; AVX512BW-FCP-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 10472; AVX512BW-FCP-NEXT: kmovq %r10, %k2 10473; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm1 {%k2} 10474; AVX512BW-FCP-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 10475; AVX512BW-FCP-NEXT: kmovq %r10, %k2 10476; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm9 {%k2} 10477; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 10478; AVX512BW-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm1 = zmm3[0,1,2,3],zmm23[4,5,6,7] 10479; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm1[23],zero,zmm1[21,22,23,26],zero,zmm1[24],zero,zmm1[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero 10480; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 10481; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm24[4,5,6,7] 10482; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61],zero,zero 10483; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 10484; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 10485; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm25[0,1,2,3],zmm26[4,5,6,7] 10486; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[27],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm2[60],zero,zmm2[62,63,62,63],zero,zmm2[61],zero,zmm2[63,60,61] 10487; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm27 10488; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm27[4,5,6,7] 10489; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[23],zero,zero,zero,zero,zmm3[26],zero,zmm3[24],zero,zero,zero,zero,zmm3[27],zero,zmm3[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm3[62],zero,zmm3[60],zero,zero,zero,zero,zmm3[63],zero,zmm3[61],zero,zero,zero 10490; AVX512BW-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2 10491; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10492; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7] 10493; AVX512BW-FCP-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 10494; AVX512BW-FCP-NEXT: kmovq %r10, %k2 10495; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm20 {%k2} 10496; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm25 10497; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm28[0,1,2,3],zmm25[4,5,6,7] 10498; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero 10499; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm28 10500; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm28[4,5,6,7] 10501; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61] 10502; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 10503; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10504; AVX512BW-FCP-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 10505; AVX512BW-FCP-NEXT: kmovq %r10, %k2 10506; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm20 {%k2} 10507; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm31 10508; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] 10509; AVX512BW-FCP-NEXT: vpermi2w %zmm31, %zmm6, %zmm1 
10510; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 10511; AVX512BW-FCP-NEXT: kmovq %rax, %k3 10512; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm20 {%k3} 10513; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 10514; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 10515; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 10516; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 10517; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 10518; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm2 10519; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 10520; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 10521; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm2, %xmm2 10522; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm3 10523; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 10524; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 10525; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 10526; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm29 10527; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 10528; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm30 10529; AVX512BW-FCP-NEXT: vporq %xmm29, %xmm30, %xmm29 10530; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm30 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 10531; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm30, %xmm30 10532; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm30, %zmm29, %zmm29 10533; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] 10534; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm29 = zmm29[0,1,0,1,4,5,4,5] 10535; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm29 {%k2} 10536; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm30 10537; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm3 10538; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 10539; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm30, %xmm0 10540; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 10541; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm18 10542; AVX512BW-FCP-NEXT: vporq %xmm0, %xmm18, %xmm0 10543; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm3[8],xmm30[8],xmm3[9],xmm30[9],xmm3[10],xmm30[10],xmm3[11],xmm30[11],xmm3[12],xmm30[12],xmm3[13],xmm30[13],xmm3[14],xmm30[14],xmm3[15],xmm30[15] 10544; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 10545; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm18, %zmm0, %zmm0 10546; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] 10547; AVX512BW-FCP-NEXT: vpermi2w %zmm31, %zmm6, %zmm18 10548; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] 10549; AVX512BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 10550; AVX512BW-FCP-NEXT: kmovq %rax, %k3 10551; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm0 {%k3} 10552; AVX512BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 10553; AVX512BW-FCP-NEXT: kmovq %rax, %k3 10554; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm29 {%k3} 10555; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm19, %xmm0 10556; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm17, %xmm6 10557; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 10558; AVX512BW-FCP-NEXT: vpunpcklbw 
{{.*#+}} xmm6 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] 10559; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 10560; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 10561; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 10562; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm6 10563; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm21, %xmm8 10564; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 10565; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3],xmm21[4],xmm22[4],xmm21[5],xmm22[5],xmm21[6],xmm22[6],xmm21[7],xmm22[7] 10566; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 10567; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 10568; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm6 10569; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm0[0,1,0,1,4,5,4,5] 10570; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm6[0,1,0,1,4,5,4,5] 10571; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C 10572; AVX512BW-FCP-NEXT: kmovq %rax, %k3 10573; AVX512BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm0 {%k3} 10574; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm16, %xmm6 10575; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 10576; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 10577; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] 10578; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 10579; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 10580; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 10581; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5] 10582; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm31, %zmm8 # 32-byte Folded Reload 10583; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 10584; AVX512BW-FCP-NEXT: vpermw %zmm8, %zmm7, %zmm7 10585; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 10586; AVX512BW-FCP-NEXT: kmovq %rax, %k3 10587; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} 10588; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 10589; AVX512BW-FCP-NEXT: kmovq %rax, %k3 10590; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm0 {%k3} 10591; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 10592; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 10593; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 10594; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm13[28],zero,ymm13[30,31,30,31],zero,ymm13[29],zero,ymm13[31,28,29] 10595; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 10596; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero 10597; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm6, %ymm2 10598; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1] 10599; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 10600; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 10601; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 10602; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero 10603; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10604; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero 10605; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 10606; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[2,3,2,3],zmm2[0,1,0,1] 10607; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} 10608; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3],xmm3[4],xmm30[4],xmm3[5],xmm30[5],xmm3[6],xmm30[6],xmm3[7],xmm30[7] 10609; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 10610; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10611; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero 10612; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10613; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] 10614; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 10615; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[0,1,0,1] 10616; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] 10617; AVX512BW-FCP-NEXT: vpermw %zmm31, %zmm3, %zmm3 10618; AVX512BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 10619; AVX512BW-FCP-NEXT: kmovq %rax, %k1 10620; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} 10621; AVX512BW-FCP-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E 10622; AVX512BW-FCP-NEXT: kmovq %rax, %k1 10623; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} 10624; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm26, %zmm1 10625; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero 10626; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm27, %zmm3 10627; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] 10628; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm3, %zmm1 10629; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm24, %zmm3 10630; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59] 10631; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm23, %zmm4 10632; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero 10633; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 10634; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10635; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] 10636; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm3 {%k2} 10637; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm28, %zmm1 10638; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] 10639; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm4 10640; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero 10641; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm4, %zmm1 10642; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 10643; AVX512BW-FCP-NEXT: vpermw %zmm8, %zmm4, %zmm4 10644; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10645; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 10646; AVX512BW-FCP-NEXT: kmovq %rax, %k1 10647; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1} 10648; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C 10649; AVX512BW-FCP-NEXT: kmovq %rax, %k1 10650; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} 10651; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10652; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) 10653; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) 10654; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) 10655; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 10656; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%rax) 10657; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) 10658; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 384(%rax) 10659; AVX512BW-FCP-NEXT: addq $104, %rsp 10660; AVX512BW-FCP-NEXT: vzeroupper 10661; AVX512BW-FCP-NEXT: retq 10662; 10663; AVX512DQ-BW-LABEL: store_i8_stride7_vf64: 10664; AVX512DQ-BW: # %bb.0: 10665; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10666; AVX512DQ-BW-NEXT: vmovdqa (%rax), %ymm13 10667; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 10668; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm13, %ymm0 10669; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 10670; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] 10671; 
AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm1, %ymm1 10672; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 10673; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm9 10674; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 10675; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm9, %ymm1 10676; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm10 10677; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] 10678; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm10, %ymm2 10679; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm2, %ymm2 10680; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm1 10681; AVX512DQ-BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10682; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm12 10683; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 10684; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 10685; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 10686; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm8 10687; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 10688; AVX512DQ-BW-NEXT: kmovq %r10, %k1 10689; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm8 {%k1} 10690; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 10691; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 10692; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm14, %ymm2 10693; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm15 10694; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 10695; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm15, %ymm4 10696; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm4, %ymm2 10697; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 10698; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm5 10699; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 10700; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 10701; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 10702; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm22 10703; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm18 10704; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] 10705; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm18, %ymm2 10706; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm19 10707; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 10708; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm19, %ymm6 10709; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm6, %ymm2 10710; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm6 10711; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 10712; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 10713; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm23[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 10714; AVX512DQ-BW-NEXT: vpermq 
{{.*#+}} ymm23 = ymm23[0,1,0,1] 10715; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm23, %zmm3 10716; AVX512DQ-BW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 10717; AVX512DQ-BW-NEXT: kmovq %r10, %k1 10718; AVX512DQ-BW-NEXT: vmovdqu8 %zmm22, %zmm3 {%k1} 10719; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 10720; AVX512DQ-BW-NEXT: kmovq %r10, %k1 10721; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm3 {%k1} 10722; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %ymm29 10723; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm29, %ymm0 10724; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm30 10725; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm30, %ymm8 10726; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm8, %ymm0 10727; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] 10728; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm29, %ymm8 10729; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] 10730; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm30, %ymm23 10731; AVX512DQ-BW-NEXT: vporq %ymm8, %ymm23, %ymm8 10732; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 10733; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 10734; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %ymm28 10735; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm16 10736; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm16[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 10737; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5] 10738; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm23 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] 10739; AVX512DQ-BW-NEXT: movl $676341840, %r10d # imm = 0x28502850 10740; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 10741; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm28, %ymm8 {%k1} 10742; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] 10743; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm16, %ymm24 10744; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm28, %ymm25 10745; AVX512DQ-BW-NEXT: vporq %ymm24, %ymm25, %ymm24 10746; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm24, %zmm8 10747; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 10748; AVX512DQ-BW-NEXT: kmovq %r10, %k2 10749; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2} 10750; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %ymm31 10751; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm31, %ymm17 10752; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm1 10753; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm1, %ymm21 10754; AVX512DQ-BW-NEXT: vporq %ymm17, %ymm21, %ymm17 10755; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] 10756; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm1, %ymm21 10757; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] 10758; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm31, %ymm27 10759; AVX512DQ-BW-NEXT: vporq %ymm21, %ymm27, %ymm21 10760; AVX512DQ-BW-NEXT: vpermq 
{{.*#+}} ymm21 = ymm21[2,3,2,3] 10761; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm17, %zmm21 10762; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %ymm17 10763; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm17, %ymm27 10764; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 10765; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm26, %ymm11 10766; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 10767; AVX512DQ-BW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 10768; AVX512DQ-BW-NEXT: kmovq %r10, %k3 10769; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm21 {%k3} 10770; AVX512DQ-BW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 10771; AVX512DQ-BW-NEXT: kmovq %r10, %k3 10772; AVX512DQ-BW-NEXT: vmovdqu8 %zmm21, %zmm8 {%k3} 10773; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm16[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 10774; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] 10775; AVX512DQ-BW-NEXT: movl $338170920, %r10d # imm = 0x14281428 10776; AVX512DQ-BW-NEXT: kmovd %r10d, %k4 10777; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm27 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] 10778; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm28, %ymm11 {%k4} 10779; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] 10780; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] 10781; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm28, %ymm21 10782; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] 10783; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm16, %ymm16 10784; AVX512DQ-BW-NEXT: vporq %ymm21, %ymm16, %ymm16 10785; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm16 10786; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm29[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 10787; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm11[0,2,3,3,4,6,7,7] 10788; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] 10789; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm30, %ymm0 10790; AVX512DQ-BW-NEXT: vmovdqu8 %ymm21, %ymm0 {%k1} 10791; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm21 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm29[24,25],zero,ymm29[23],zero,ymm29[21,22,23,26],zero,ymm29[24],zero,ymm29[28,29,26,27] 10792; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm29 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero,zero,zero 10793; AVX512DQ-BW-NEXT: vporq %ymm21, %ymm29, %ymm21 10794; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 10795; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] 10796; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] 10797; AVX512DQ-BW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 10798; AVX512DQ-BW-NEXT: kmovq %r10, %k3 10799; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k3} 10800; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 10801; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7] 10802; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero 10803; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm29 10804; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm31[0,1,2,3],zmm29[4,5,6,7] 10805; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61] 10806; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm21, %zmm1 10807; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10808; AVX512DQ-BW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 10809; AVX512DQ-BW-NEXT: kmovq %r10, %k3 10810; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k3} 10811; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm21 10812; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] 10813; AVX512DQ-BW-NEXT: vpermi2w %zmm21, %zmm17, %zmm1 10814; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 10815; AVX512DQ-BW-NEXT: kmovq %rax, %k5 10816; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k5} 10817; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] 10818; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] 10819; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm19, %ymm1 {%k1} 10820; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm19, %ymm2 10821; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm18, %ymm23 10822; AVX512DQ-BW-NEXT: vporq %ymm2, %ymm23, %ymm2 10823; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 10824; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 10825; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 10826; AVX512DQ-BW-NEXT: vpshufb %zmm20, %zmm2, %zmm2 10827; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm20 10828; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm20, %zmm20 10829; AVX512DQ-BW-NEXT: vpshufb %zmm22, %zmm20, %zmm20 10830; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm20, %zmm2 10831; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10832; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7] 10833; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm20 {%k3} 10834; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm22 10835; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm29, %zmm1 10836; AVX512DQ-BW-NEXT: vpshufb %zmm24, %zmm1, %zmm1 10837; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm23 10838; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 10839; AVX512DQ-BW-NEXT: vpshufb %zmm25, %zmm0, %zmm2 10840; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm0 10841; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1 10842; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm2 10843; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 10844; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm21, %zmm13 10845; AVX512DQ-BW-NEXT: vpermw %zmm13, %zmm26, %zmm24 10846; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 10847; AVX512DQ-BW-NEXT: kmovq %rax, %k5 10848; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm1 {%k5} 10849; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm24 10850; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C 10851; AVX512DQ-BW-NEXT: kmovq %rax, %k5 10852; AVX512DQ-BW-NEXT: 
vmovdqu8 %zmm1, %zmm20 {%k5} 10853; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm1 10854; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] 10855; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm18[2,2,3,3,6,6,7,7] 10856; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm19, %ymm25 {%k4} 10857; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm24[0],xmm1[0],xmm24[1],xmm1[1],xmm24[2],xmm1[2],xmm24[3],xmm1[3],xmm24[4],xmm1[4],xmm24[5],xmm1[5],xmm24[6],xmm1[6],xmm24[7],xmm1[7] 10858; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 10859; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 10860; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm25[2,3,2,3],zmm19[0,1,0,1] 10861; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 10862; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm15, %ymm11 10863; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 10864; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm26, %xmm15 10865; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 10866; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,3,3,4,6,7,7] 10867; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k1} 10868; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm11[2,3,2,3],zmm15[0,1,0,1] 10869; AVX512DQ-BW-NEXT: vmovdqu8 %zmm25, %zmm14 {%k2} 10870; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] 10871; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 10872; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm11 10873; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero 10874; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29] 10875; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9 10876; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[2,3,2,3],zmm11[0,1,0,1] 10877; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] 10878; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm10, %zmm10 10879; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 10880; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10881; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k1} 10882; AVX512DQ-BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E 10883; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10884; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm14 {%k1} 10885; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 10886; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm2, %xmm9 10887; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 10888; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm0, %xmm25 10889; AVX512DQ-BW-NEXT: vporq %xmm9, %xmm25, %xmm9 10890; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 10891; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 10892; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, 
%zmm9, %zmm0 10893; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 10894; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm1, %xmm9 10895; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 10896; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm24, %xmm26 10897; AVX512DQ-BW-NEXT: vporq %xmm9, %xmm26, %xmm9 10898; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm24[8],xmm1[9],xmm24[9],xmm1[10],xmm24[10],xmm1[11],xmm24[11],xmm1[12],xmm24[12],xmm1[13],xmm24[13],xmm1[14],xmm24[14],xmm1[15],xmm24[15] 10899; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] 10900; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 10901; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 10902; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,1,0,1,4,5,4,5] 10903; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm9 {%k3} 10904; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 10905; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm22, %xmm1 10906; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] 10907; AVX512DQ-BW-NEXT: vpermi2w %zmm21, %zmm17, %zmm24 10908; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 10909; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm23, %xmm21 10910; AVX512DQ-BW-NEXT: vporq %xmm1, %xmm21, %xmm1 10911; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] 10912; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 10913; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm21, %zmm1, %zmm1 10914; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 10915; AVX512DQ-BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 10916; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10917; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm1 {%k1} 10918; AVX512DQ-BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 10919; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10920; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm9 {%k1} 10921; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm5, %xmm1 10922; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm4, %xmm10 10923; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm10, %xmm1 10924; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 10925; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm4, %xmm4 10926; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 10927; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 10928; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm6, %xmm4 10929; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm4, %xmm2 10930; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 10931; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm4, %xmm4 10932; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 10933; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] 10934; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] 10935; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C 10936; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10937; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} 10938; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 
10939; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 10940; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm12, %xmm1 10941; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 10942; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] 10943; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 10944; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 10945; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 10946; AVX512DQ-BW-NEXT: vpermw %zmm13, %zmm1, %zmm1 10947; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] 10948; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 10949; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10950; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} 10951; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 10952; AVX512DQ-BW-NEXT: kmovq %rax, %k1 10953; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} 10954; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 10955; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) 10956; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 320(%rax) 10957; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 256(%rax) 10958; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%rax) 10959; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rax) 10960; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) 10961; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 384(%rax) 10962; AVX512DQ-BW-NEXT: vzeroupper 10963; AVX512DQ-BW-NEXT: retq 10964; 10965; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64: 10966; AVX512DQ-BW-FCP: # %bb.0: 10967; AVX512DQ-BW-FCP-NEXT: subq $104, %rsp 10968; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 10969; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %ymm2 10970; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 10971; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] 10972; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm2, %ymm0 10973; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] 10974; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] 10975; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1 10976; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 10977; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 10978; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10979; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] 10980; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm3 10981; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm4 10982; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10983; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] 10984; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 10985; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 10986; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm16 10987; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm15 10988; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm16[8],xmm15[9],xmm16[9],xmm15[10],xmm16[10],xmm15[11],xmm16[11],xmm15[12],xmm16[12],xmm15[13],xmm16[13],xmm15[14],xmm16[14],xmm15[15],xmm16[15] 10989; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 
10990; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] 10991; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 10992; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 10993; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 10994; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} 10995; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm4 10996; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10997; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 10998; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 10999; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 11000; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11001; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] 11002; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 11003; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 11004; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm17 11005; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm19 11006; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] 11007; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] 11008; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm6, %xmm6 11009; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] 11010; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 11011; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 11012; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11013; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] 11014; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 11015; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm7 11016; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11017; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] 11018; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm7, %ymm21 11019; AVX512DQ-BW-FCP-NEXT: vporq %ymm6, %ymm21, %ymm6 11020; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm21 11021; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm22 11022; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] 11023; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] 11024; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm24 11025; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1] 11026; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm24, %zmm14 11027; AVX512DQ-BW-FCP-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 11028; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 11029; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm14 {%k1} 11030; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 11031; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 11032; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm14 {%k1} 11033; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 11034; AVX512DQ-BW-FCP-NEXT: vpshufb 
%ymm2, %ymm3, %ymm4 11035; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 11036; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm5 11037; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 11038; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] 11039; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 11040; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] 11041; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm24 11042; AVX512DQ-BW-FCP-NEXT: vporq %ymm5, %ymm24, %ymm5 11043; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] 11044; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 11045; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 11046; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 11047; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm25 11048; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm25, %ymm23 11049; AVX512DQ-BW-FCP-NEXT: vporq %ymm9, %ymm23, %ymm9 11050; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] 11051; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm23 11052; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] 11053; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm25, %ymm24 11054; AVX512DQ-BW-FCP-NEXT: vporq %ymm23, %ymm24, %ymm23 11055; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] 11056; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm9, %zmm9 11057; AVX512DQ-BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 11058; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 11059; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm9 {%k1} 11060; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm4 11061; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 11062; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm28 11063; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm28, %ymm1 11064; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 11065; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] 11066; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm28, %ymm1 11067; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] 11068; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm23 11069; AVX512DQ-BW-FCP-NEXT: vporq %ymm1, %ymm23, %ymm1 11070; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] 11071; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 11072; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rax), %ymm6 11073; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 
= [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] 11074; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm23, %ymm23 11075; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm6, %ymm20 11076; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 11077; AVX512DQ-BW-FCP-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 11078; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 11079; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm1 {%k2} 11080; AVX512DQ-BW-FCP-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 11081; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 11082; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm9 {%k2} 11083; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 11084; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm23[4,5,6,7] 11085; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm1[23],zero,zmm1[21,22,23,26],zero,zmm1[24],zero,zmm1[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero 11086; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 11087; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm24[4,5,6,7] 11088; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61],zero,zero 11089; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 11090; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 11091; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm25[0,1,2,3],zmm26[4,5,6,7] 11092; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[27],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm2[60],zero,zmm2[62,63,62,63],zero,zmm2[61],zero,zmm2[63,60,61] 11093; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm27 11094; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm27[4,5,6,7] 11095; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[23],zero,zero,zero,zero,zmm3[26],zero,zmm3[24],zero,zero,zero,zero,zmm3[27],zero,zmm3[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm3[62],zero,zmm3[60],zero,zero,zero,zero,zmm3[63],zero,zmm3[61],zero,zero,zero 11096; AVX512DQ-BW-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2 11097; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 11098; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7] 11099; AVX512DQ-BW-FCP-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 11100; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 11101; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm20 {%k2} 11102; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm25 11103; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm28[0,1,2,3],zmm25[4,5,6,7] 11104; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero 11105; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm28 11106; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm28[4,5,6,7] 11107; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = 
zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61] 11108; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 11109; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 11110; AVX512DQ-BW-FCP-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 11111; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 11112; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm20 {%k2} 11113; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm31 11114; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] 11115; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm31, %zmm6, %zmm1 11116; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 11117; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 11118; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm20 {%k3} 11119; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 11120; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 11121; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] 11122; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 11123; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] 11124; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm2 11125; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 11126; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 11127; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm2, %xmm2 11128; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm3 11129; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 11130; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 11131; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] 11132; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm29 11133; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] 11134; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm30 11135; AVX512DQ-BW-FCP-NEXT: vporq %xmm29, %xmm30, %xmm29 11136; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm30 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 11137; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm30, %xmm30 11138; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm30, %zmm29, %zmm29 11139; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] 11140; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm29 = zmm29[0,1,0,1,4,5,4,5] 11141; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm29 {%k2} 11142; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm30 11143; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm3 11144; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] 11145; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm30, %xmm0 11146; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] 11147; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm18 11148; AVX512DQ-BW-FCP-NEXT: vporq %xmm0, %xmm18, %xmm0 11149; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm3[8],xmm30[8],xmm3[9],xmm30[9],xmm3[10],xmm30[10],xmm3[11],xmm30[11],xmm3[12],xmm30[12],xmm3[13],xmm30[13],xmm3[14],xmm30[14],xmm3[15],xmm30[15] 11150; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] 11151; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm18, %zmm0, %zmm0 11152; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] 11153; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm31, %zmm6, %zmm18 11154; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] 11155; AVX512DQ-BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 11156; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 11157; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm0 {%k3} 11158; AVX512DQ-BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 11159; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 11160; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm29 {%k3} 11161; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm19, %xmm0 11162; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm17, %xmm6 11163; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 11164; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] 11165; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] 11166; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 11167; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 11168; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm6 11169; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm21, %xmm8 11170; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 11171; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3],xmm21[4],xmm22[4],xmm21[5],xmm22[5],xmm21[6],xmm22[6],xmm21[7],xmm22[7] 11172; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] 11173; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 11174; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm6 11175; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm0[0,1,0,1,4,5,4,5] 11176; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm6[0,1,0,1,4,5,4,5] 11177; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C 11178; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 11179; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm0 {%k3} 11180; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm16, %xmm6 11181; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 11182; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 11183; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] 11184; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] 11185; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 11186; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 11187; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5] 11188; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm31, %zmm8 # 32-byte Folded Reload 11189; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] 11190; AVX512DQ-BW-FCP-NEXT: vpermw %zmm8, %zmm7, %zmm7 11191; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 11192; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 11193; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} 11194; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 11195; 
AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 11196; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm0 {%k3} 11197; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 11198; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 11199; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 11200; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm13[28],zero,ymm13[30,31,30,31],zero,ymm13[29],zero,ymm13[31,28,29] 11201; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 11202; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero 11203; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm6, %ymm2 11204; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1] 11205; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 11206; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 11207; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 11208; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero 11209; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11210; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero 11211; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 11212; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[2,3,2,3],zmm2[0,1,0,1] 11213; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} 11214; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3],xmm3[4],xmm30[4],xmm3[5],xmm30[5],xmm3[6],xmm30[6],xmm3[7],xmm30[7] 11215; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11216; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11217; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero 11218; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11219; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] 11220; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 11221; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[0,1,0,1] 11222; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] 11223; AVX512DQ-BW-FCP-NEXT: vpermw %zmm31, %zmm3, %zmm3 11224; AVX512DQ-BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 11225; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 11226; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} 11227; AVX512DQ-BW-FCP-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E 11228; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 11229; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} 11230; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm26, %zmm1 11231; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero 11232; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm27, %zmm3 11233; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] 11234; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm3, %zmm1 11235; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm24, %zmm3 11236; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59] 11237; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm23, %zmm4 11238; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero 11239; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 11240; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 11241; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] 11242; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm3 {%k2} 11243; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm28, %zmm1 11244; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] 11245; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm4 11246; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero 11247; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm4, %zmm1 11248; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] 11249; AVX512DQ-BW-FCP-NEXT: vpermw %zmm8, %zmm4, %zmm4 11250; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] 11251; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 11252; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 11253; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k1} 11254; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C 11255; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 11256; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} 
11257; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11258; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) 11259; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) 11260; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) 11261; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 11262; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%rax) 11263; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) 11264; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 384(%rax) 11265; AVX512DQ-BW-FCP-NEXT: addq $104, %rsp 11266; AVX512DQ-BW-FCP-NEXT: vzeroupper 11267; AVX512DQ-BW-FCP-NEXT: retq 11268 %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 11269 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 11270 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 11271 %in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64 11272 %in.vec4 = load <64 x i8>, ptr %in.vecptr4, align 64 11273 %in.vec5 = load <64 x i8>, ptr %in.vecptr5, align 64 11274 %in.vec6 = load <64 x i8>, ptr %in.vecptr6, align 64 11275 %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 11276 %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 11277 %3 = shufflevector <64 x i8> %in.vec4, <64 x i8> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, 
i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 11278 %4 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 11279 %5 = shufflevector <64 x i8> %in.vec6, <64 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 
54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 11280 %6 = shufflevector <128 x i8> %3, <128 x i8> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191> 11281 %7 = shufflevector <192 x i8> %6, <192 x i8> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, 
i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 11282 %8 = shufflevector <256 x i8> %4, <256 x i8> %7, <448 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 
254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447> 11283 %interleaved.vec = shufflevector <448 x i8> %8, <448 x i8> poison, <448 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 391, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 28, i32 92, i32 156, i32 
220, i32 284, i32 348, i32 412, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 440, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 442, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447> 11284 store <448 x i8> %interleaved.vec, ptr %out.vec, align 64 11285 ret void 11286} 11287
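; A note on the shape of the IR above (a reading aid, stated only from the shuffles
; shown here): the inputs are seven 64-byte vectors, so the chain first concatenates
; them pairwise (%4 covers inputs 0-3, %6/%7 cover inputs 4-6, with %5 widening the
; seventh <64 x i8> input), %8 then joins everything into one <448 x i8> value, and
; %interleaved.vec applies the stride-7 interleave mask <0, 64, 128, 192, 256, 320,
; 384, 1, 65, ...>, i.e. output element 7*k + s is element k of input vector s for
; s in [0,7) and k in [0,64), before the single wide store to %out.vec.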