; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
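; For context only (not exercised by the RUN lines above): the interleaved
; stores below correspond to a scalar source loop roughly like the following
; C sketch; the variable names are made up for illustration.
;
;   // each iteration writes five consecutive bytes, one from each input stream
;   for (int i = 0; i < n; ++i) {
;     out[5 * i + 0] = in0[i];
;     out[5 * i + 1] = in1[i];
;     out[5 * i + 2] = in2[i];
;     out[5 * i + 3] = in3[i];
;     out[5 * i + 4] = in4[i];
;   }
;
; After vectorization by a factor VF, the five per-stream values are
; concatenated and re-ordered into a single <5*VF x i8> vector by the
; shufflevector chains in each function below, then written with one wide
; store; the backend lowering of that wide shuffled store is what each
; @store_i8_stride5_vf* function checks.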
define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride5_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa (%rdx), %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,7,5]
; SSE-NEXT: packuswb %xmm0, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, (%r9)
; SSE-NEXT: pextrw $4, %xmm0, %eax
; SSE-NEXT: movw %ax, 8(%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride5_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rdx), %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX-NEXT: vmovq %xmm0, (%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride5_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX2-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX2-NEXT: vmovq %xmm0, (%r9)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride5_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride5_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride5_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512-NEXT: vmovq %xmm0, (%r9)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride5_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride5_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride5_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512BW-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512BW-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride5_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride5_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpextrw $4, %xmm0, 8(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i8>, ptr %in.vecptr4, align 64
  %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <2 x i8> %in.vec4, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %interleaved.vec = shufflevector <10 x i8> %5, <10 x i8> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
  store <10 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride5_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa (%rdx), %xmm2
; SSE-NEXT: movdqa (%r8), %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,1,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,0,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,0,65535,65535,65535,0]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,5,7]
; SSE-NEXT: pand %xmm5, %xmm6
; SSE-NEXT: pandn %xmm4, %xmm5
; SSE-NEXT: por
%xmm6, %xmm5 243; SSE-NEXT: packuswb %xmm3, %xmm5 244; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 245; SSE-NEXT: pand %xmm3, %xmm5 246; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] 247; SSE-NEXT: pandn %xmm4, %xmm3 248; SSE-NEXT: por %xmm5, %xmm3 249; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 250; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] 251; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] 252; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 253; SSE-NEXT: por %xmm1, %xmm2 254; SSE-NEXT: packuswb %xmm2, %xmm2 255; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] 256; SSE-NEXT: pand %xmm1, %xmm2 257; SSE-NEXT: pandn %xmm0, %xmm1 258; SSE-NEXT: por %xmm2, %xmm1 259; SSE-NEXT: movd %xmm1, 16(%r9) 260; SSE-NEXT: movdqa %xmm3, (%r9) 261; SSE-NEXT: retq 262; 263; AVX-LABEL: store_i8_stride5_vf4: 264; AVX: # %bb.0: 265; AVX-NEXT: vmovdqa (%rdi), %xmm0 266; AVX-NEXT: vmovdqa (%rdx), %xmm1 267; AVX-NEXT: vmovdqa (%r8), %xmm2 268; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 269; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 270; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 271; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,4,8,12],zero,xmm0[1,5,9,13],zero,xmm0[2,6,10,14],zero,xmm0[3] 272; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero 273; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 274; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6] 275; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 276; AVX-NEXT: vmovd %xmm0, 16(%r9) 277; AVX-NEXT: vmovdqa %xmm1, (%r9) 278; AVX-NEXT: retq 279; 280; AVX2-LABEL: store_i8_stride5_vf4: 281; AVX2: # %bb.0: 282; AVX2-NEXT: vmovdqa (%rdi), %xmm0 283; AVX2-NEXT: vmovdqa (%rdx), %xmm1 284; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 285; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 286; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 287; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 288; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 289; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 290; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 291; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 292; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 293; AVX2-NEXT: vmovd %xmm1, 16(%r9) 294; AVX2-NEXT: vmovdqa %xmm0, (%r9) 295; AVX2-NEXT: vzeroupper 296; AVX2-NEXT: retq 297; 298; AVX2-FP-LABEL: store_i8_stride5_vf4: 299; AVX2-FP: # %bb.0: 300; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 301; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 302; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 303; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 304; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 305; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 306; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 307; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 308; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 309; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 310; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 311; AVX2-FP-NEXT: vmovd %xmm1, 16(%r9) 312; AVX2-FP-NEXT: vmovdqa %xmm0, (%r9) 313; AVX2-FP-NEXT: vzeroupper 314; AVX2-FP-NEXT: retq 315; 316; AVX2-FCP-LABEL: store_i8_stride5_vf4: 317; AVX2-FCP: # %bb.0: 318; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 319; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 320; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 321; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 322; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 323; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 324; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 325; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 326; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 327; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 328; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 329; AVX2-FCP-NEXT: vmovd %xmm1, 16(%r9) 330; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r9) 331; AVX2-FCP-NEXT: vzeroupper 332; AVX2-FCP-NEXT: retq 333; 334; AVX512-LABEL: store_i8_stride5_vf4: 335; AVX512: # %bb.0: 336; AVX512-NEXT: vmovdqa (%rdi), %xmm0 337; AVX512-NEXT: vmovdqa (%rdx), %xmm1 338; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 339; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 340; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 341; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 342; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u] 343; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 344; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 345; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 346; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 347; AVX512-NEXT: vmovd %xmm1, 16(%r9) 348; AVX512-NEXT: vmovdqa %xmm0, (%r9) 349; AVX512-NEXT: vzeroupper 350; AVX512-NEXT: retq 351; 352; AVX512-FCP-LABEL: store_i8_stride5_vf4: 353; AVX512-FCP: # %bb.0: 354; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 355; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 356; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 357; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 358; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 359; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 360; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u] 361; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 362; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 363; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 364; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 365; AVX512-FCP-NEXT: vmovd %xmm1, 16(%r9) 366; AVX512-FCP-NEXT: vmovdqa %xmm0, (%r9) 367; AVX512-FCP-NEXT: vzeroupper 368; AVX512-FCP-NEXT: retq 369; 370; AVX512DQ-LABEL: store_i8_stride5_vf4: 371; AVX512DQ: # %bb.0: 372; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 373; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 374; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 375; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 376; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 377; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 378; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u] 379; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 380; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 381; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 382; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 383; AVX512DQ-NEXT: vmovd %xmm1, 16(%r9) 384; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9) 385; AVX512DQ-NEXT: vzeroupper 386; AVX512DQ-NEXT: retq 387; 388; AVX512DQ-FCP-LABEL: store_i8_stride5_vf4: 389; AVX512DQ-FCP: # %bb.0: 390; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 391; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 392; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 393; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 394; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 395; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 396; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u] 397; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 398; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 399; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 400; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 401; AVX512DQ-FCP-NEXT: vmovd %xmm1, 16(%r9) 402; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%r9) 403; AVX512DQ-FCP-NEXT: vzeroupper 404; AVX512DQ-FCP-NEXT: retq 405; 406; AVX512BW-LABEL: store_i8_stride5_vf4: 407; AVX512BW: # %bb.0: 408; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 409; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 410; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 411; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 412; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 413; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 414; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 415; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 416; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 417; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 418; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 419; AVX512BW-NEXT: vmovd %xmm1, 16(%r9) 420; AVX512BW-NEXT: vmovdqa %xmm0, (%r9) 421; AVX512BW-NEXT: vzeroupper 422; AVX512BW-NEXT: retq 423; 424; AVX512BW-FCP-LABEL: store_i8_stride5_vf4: 425; AVX512BW-FCP: # %bb.0: 426; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 427; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 428; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 429; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 430; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 431; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 432; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 433; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 434; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 435; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 436; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 437; AVX512BW-FCP-NEXT: vmovd %xmm1, 16(%r9) 438; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r9) 439; AVX512BW-FCP-NEXT: vzeroupper 440; AVX512BW-FCP-NEXT: retq 441; 442; AVX512DQ-BW-LABEL: store_i8_stride5_vf4: 443; AVX512DQ-BW: # %bb.0: 444; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 445; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 446; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 447; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 448; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 449; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 450; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 451; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 452; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 453; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 454; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 455; AVX512DQ-BW-NEXT: vmovd %xmm1, 16(%r9) 456; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%r9) 457; AVX512DQ-BW-NEXT: vzeroupper 458; AVX512DQ-BW-NEXT: retq 459; 460; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf4: 461; AVX512DQ-BW-FCP: # %bb.0: 462; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 463; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 464; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 465; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 466; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 467; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 468; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 469; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 470; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 471; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 472; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 473; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, 16(%r9) 474; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r9) 475; AVX512DQ-BW-FCP-NEXT: vzeroupper 476; AVX512DQ-BW-FCP-NEXT: retq 477 %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 478 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 479 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 480 %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64 481 %in.vec4 = load <4 x i8>, ptr %in.vecptr4, align 64 482 %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 483 %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 484 %3 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 485 %4 = shufflevector <4 x i8> %in.vec4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 486 %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19> 487 %interleaved.vec = shufflevector <20 x i8> %5, <20 x i8> poison, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19> 488 store <20 x i8> %interleaved.vec, ptr %out.vec, align 64 489 ret void 490} 491 492define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { 493; SSE-LABEL: store_i8_stride5_vf8: 494; SSE: # %bb.0: 495; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 496; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 497; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero 498; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero 499; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 500; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] 501; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,65535,65535,0] 502; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 503; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] 504; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,0,4,5,6,7] 505; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 506; SSE-NEXT: pand %xmm8, %xmm6 507; SSE-NEXT: pandn %xmm5, %xmm8 508; SSE-NEXT: por %xmm6, %xmm8 509; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] 510; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[1,2,2,3,4,5,6,7] 511; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] 512; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 513; SSE-NEXT: movdqa %xmm6, %xmm10 514; SSE-NEXT: pandn %xmm5, %xmm10 515; SSE-NEXT: movdqa %xmm2, %xmm7 516; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] 517; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,1,2,3] 518; SSE-NEXT: pshuflw {{.*#+}} xmm5 
= xmm5[3,1,0,3,4,5,6,7] 519; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] 520; SSE-NEXT: pand %xmm6, %xmm5 521; SSE-NEXT: por %xmm10, %xmm5 522; SSE-NEXT: pand %xmm9, %xmm5 523; SSE-NEXT: pandn %xmm8, %xmm9 524; SSE-NEXT: por %xmm5, %xmm9 525; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 526; SSE-NEXT: pand %xmm8, %xmm9 527; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] 528; SSE-NEXT: movdqa %xmm8, %xmm5 529; SSE-NEXT: pandn %xmm10, %xmm5 530; SSE-NEXT: por %xmm9, %xmm5 531; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm4[1,0,2,3,4,5,6,7] 532; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] 533; SSE-NEXT: pand %xmm8, %xmm9 534; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,0,2,3,4,5,6,7] 535; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] 536; SSE-NEXT: pandn %xmm10, %xmm8 537; SSE-NEXT: por %xmm9, %xmm8 538; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 539; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] 540; SSE-NEXT: pand %xmm6, %xmm10 541; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] 542; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,1,3] 543; SSE-NEXT: pandn %xmm7, %xmm6 544; SSE-NEXT: por %xmm10, %xmm6 545; SSE-NEXT: pand %xmm9, %xmm6 546; SSE-NEXT: pandn %xmm8, %xmm9 547; SSE-NEXT: por %xmm6, %xmm9 548; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 549; SSE-NEXT: pand %xmm6, %xmm9 550; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] 551; SSE-NEXT: pandn %xmm7, %xmm6 552; SSE-NEXT: por %xmm9, %xmm6 553; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] 554; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,255,255,255,255,255] 555; SSE-NEXT: pand %xmm7, %xmm4 556; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 557; SSE-NEXT: pandn %xmm3, %xmm7 558; SSE-NEXT: por %xmm4, %xmm7 559; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] 560; SSE-NEXT: pand %xmm3, %xmm7 561; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 562; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,7,7,7] 563; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 564; SSE-NEXT: pandn %xmm1, %xmm3 565; SSE-NEXT: por %xmm7, %xmm3 566; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,255,255,255,255] 567; SSE-NEXT: pand %xmm1, %xmm3 568; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 569; SSE-NEXT: pandn %xmm0, %xmm1 570; SSE-NEXT: por %xmm3, %xmm1 571; SSE-NEXT: movq %xmm1, 32(%r9) 572; SSE-NEXT: movdqa %xmm6, (%r9) 573; SSE-NEXT: movdqa %xmm5, 16(%r9) 574; SSE-NEXT: retq 575; 576; AVX-LABEL: store_i8_stride5_vf8: 577; AVX: # %bb.0: 578; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 579; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 580; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 581; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 582; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 583; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 584; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 585; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 586; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 587; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 588; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 589; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,13,6,8,10,12,15,u,u,u,u,u,u,u,u] 590; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,8,u],zero,zero,xmm1[1,9,u],zero,zero,xmm1[2,10,u],zero 591; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,1,9],zero,zero,xmm0[u,2,10],zero,zero,xmm0[u,3] 592; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 593; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] 594; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero 595; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 596; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[3,11,u],zero,zero,xmm1[4,12,u],zero,zero,xmm1[5,13,u],zero,zero 597; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,xmm0[u,4,12],zero,zero,xmm0[u,5,13],zero,zero,xmm0[u,6,14] 598; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 599; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15] 600; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3],zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,xmm2[5],zero,zero 601; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 602; AVX-NEXT: vmovdqa %xmm0, 16(%r9) 603; AVX-NEXT: vmovdqa %xmm4, (%r9) 604; AVX-NEXT: vmovq %xmm3, 32(%r9) 605; AVX-NEXT: retq 606; 607; AVX2-LABEL: store_i8_stride5_vf8: 608; AVX2: # %bb.0: 609; AVX2-NEXT: movq (%r8), %rax 610; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 611; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 612; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 613; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 614; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 615; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 616; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 617; AVX2-NEXT: vmovq %rax, %xmm3 618; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 619; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 620; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 621; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 622; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] 623; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 624; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 625; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 626; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 627; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 628; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 629; AVX2-NEXT: shrq $48, %rax 630; AVX2-NEXT: vmovd %eax, %xmm1 631; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 632; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] 633; AVX2-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 634; AVX2-NEXT: vmovq %xmm0, 32(%r9) 635; AVX2-NEXT: vmovdqa %ymm2, (%r9) 636; AVX2-NEXT: vzeroupper 637; AVX2-NEXT: retq 638; 639; AVX2-FP-LABEL: store_i8_stride5_vf8: 640; AVX2-FP: # %bb.0: 641; AVX2-FP-NEXT: movq (%r8), %rax 642; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 643; AVX2-FP-NEXT: vmovq {{.*#+}} 
xmm1 = mem[0],zero 644; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 645; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 646; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 647; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 648; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 649; AVX2-FP-NEXT: vmovq %rax, %xmm3 650; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 651; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 652; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 653; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2 654; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] 655; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 656; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 657; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 658; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 659; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 660; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 661; AVX2-FP-NEXT: shrq $48, %rax 662; AVX2-FP-NEXT: vmovd %eax, %xmm1 663; AVX2-FP-NEXT: vpbroadcastw %xmm1, %xmm1 664; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] 665; AVX2-FP-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 666; AVX2-FP-NEXT: vmovq %xmm0, 32(%r9) 667; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) 668; AVX2-FP-NEXT: vzeroupper 669; AVX2-FP-NEXT: retq 670; 671; AVX2-FCP-LABEL: store_i8_stride5_vf8: 672; AVX2-FCP: # %bb.0: 673; AVX2-FCP-NEXT: movq (%r8), %rax 674; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 675; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 676; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 677; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 678; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 679; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 680; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 681; AVX2-FCP-NEXT: vmovq %rax, %xmm3 682; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 683; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 684; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 685; AVX2-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 686; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] 687; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 688; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 689; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 690; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 691; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 692; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 693; AVX2-FCP-NEXT: shrq $48, %rax 694; AVX2-FCP-NEXT: vmovd %eax, %xmm1 
695; AVX2-FCP-NEXT: vpbroadcastw %xmm1, %xmm1 696; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] 697; AVX2-FCP-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 698; AVX2-FCP-NEXT: vmovq %xmm0, 32(%r9) 699; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) 700; AVX2-FCP-NEXT: vzeroupper 701; AVX2-FCP-NEXT: retq 702; 703; AVX512-LABEL: store_i8_stride5_vf8: 704; AVX512: # %bb.0: 705; AVX512-NEXT: movq (%r8), %rax 706; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 707; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 708; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 709; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 710; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 711; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 712; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 713; AVX512-NEXT: vmovq %rax, %xmm3 714; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero 715; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 716; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] 717; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 718; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] 719; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 720; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) 721; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 722; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 723; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 724; AVX512-NEXT: shrq $48, %rax 725; AVX512-NEXT: vmovd %eax, %xmm1 726; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 727; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) 728; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 729; AVX512-NEXT: vmovq %xmm1, 32(%r9) 730; AVX512-NEXT: vmovdqa %ymm0, (%r9) 731; AVX512-NEXT: vzeroupper 732; AVX512-NEXT: retq 733; 734; AVX512-FCP-LABEL: store_i8_stride5_vf8: 735; AVX512-FCP: # %bb.0: 736; AVX512-FCP-NEXT: movq (%r8), %rax 737; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 738; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 739; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 740; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 741; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 742; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 743; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 744; AVX512-FCP-NEXT: vmovq %rax, %xmm3 745; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero 746; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 747; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] 748; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 749; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] 750; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 751; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) 752; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 753; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 754; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 755; AVX512-FCP-NEXT: shrq $48, %rax 756; AVX512-FCP-NEXT: vmovd %eax, %xmm1 757; AVX512-FCP-NEXT: vpbroadcastw %xmm1, %xmm1 758; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) 759; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 760; AVX512-FCP-NEXT: vmovq %xmm1, 32(%r9) 761; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r9) 762; AVX512-FCP-NEXT: vzeroupper 763; AVX512-FCP-NEXT: retq 764; 765; AVX512DQ-LABEL: store_i8_stride5_vf8: 766; AVX512DQ: # %bb.0: 767; AVX512DQ-NEXT: movq (%r8), %rax 768; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 769; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 770; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 771; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 772; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 773; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 774; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 775; AVX512DQ-NEXT: vmovq %rax, %xmm3 776; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero 777; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 778; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] 779; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 780; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] 781; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 782; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) 783; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 784; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 785; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 786; AVX512DQ-NEXT: shrq $48, %rax 787; AVX512DQ-NEXT: vmovd %eax, %xmm1 788; AVX512DQ-NEXT: vpbroadcastw %xmm1, %xmm1 789; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) 790; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 791; AVX512DQ-NEXT: vmovq %xmm1, 32(%r9) 792; AVX512DQ-NEXT: vmovdqa %ymm0, (%r9) 793; AVX512DQ-NEXT: vzeroupper 794; AVX512DQ-NEXT: retq 795; 796; AVX512DQ-FCP-LABEL: store_i8_stride5_vf8: 797; AVX512DQ-FCP: # %bb.0: 798; AVX512DQ-FCP-NEXT: movq (%r8), %rax 799; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 800; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 801; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 802; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 803; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 804; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 805; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 806; AVX512DQ-FCP-NEXT: vmovq %rax, %xmm3 807; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero 808; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 809; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] 810; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 811; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] 812; 
AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 813; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) 814; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 815; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 816; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 817; AVX512DQ-FCP-NEXT: shrq $48, %rax 818; AVX512DQ-FCP-NEXT: vmovd %eax, %xmm1 819; AVX512DQ-FCP-NEXT: vpbroadcastw %xmm1, %xmm1 820; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) 821; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 822; AVX512DQ-FCP-NEXT: vmovq %xmm1, 32(%r9) 823; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r9) 824; AVX512DQ-FCP-NEXT: vzeroupper 825; AVX512DQ-FCP-NEXT: retq 826; 827; AVX512BW-LABEL: store_i8_stride5_vf8: 828; AVX512BW: # %bb.0: 829; AVX512BW-NEXT: movq (%r8), %rax 830; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 831; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 832; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 833; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 834; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 835; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 836; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 837; AVX512BW-NEXT: vmovq %rax, %xmm3 838; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 839; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 840; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 841; AVX512BW-NEXT: vpor %ymm4, %ymm2, %ymm2 842; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] 843; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 844; AVX512BW-NEXT: movl $554189328, %ecx # imm = 0x21084210 845; AVX512BW-NEXT: kmovd %ecx, %k1 846; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} 847; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 848; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 849; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 850; AVX512BW-NEXT: shrq $48, %rax 851; AVX512BW-NEXT: vpbroadcastw %eax, %xmm1 852; AVX512BW-NEXT: movw $132, %ax 853; AVX512BW-NEXT: kmovd %eax, %k1 854; AVX512BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 855; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 856; AVX512BW-NEXT: vmovq %xmm0, 32(%r9) 857; AVX512BW-NEXT: vmovdqa %ymm1, (%r9) 858; AVX512BW-NEXT: vzeroupper 859; AVX512BW-NEXT: retq 860; 861; AVX512BW-FCP-LABEL: store_i8_stride5_vf8: 862; AVX512BW-FCP: # %bb.0: 863; AVX512BW-FCP-NEXT: movq (%r8), %rax 864; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 865; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 866; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 867; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 868; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 869; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 870; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 871; AVX512BW-FCP-NEXT: vmovq %rax, %xmm3 872; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 873; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 874; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 875; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2 876; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] 877; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 878; AVX512BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210 879; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 880; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} 881; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 882; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 883; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 884; AVX512BW-FCP-NEXT: shrq $48, %rax 885; AVX512BW-FCP-NEXT: vpbroadcastw %eax, %xmm1 886; AVX512BW-FCP-NEXT: movw $132, %ax 887; AVX512BW-FCP-NEXT: kmovd %eax, %k1 888; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 889; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 890; AVX512BW-FCP-NEXT: vmovq %xmm0, 32(%r9) 891; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%r9) 892; AVX512BW-FCP-NEXT: vzeroupper 893; AVX512BW-FCP-NEXT: retq 894; 895; AVX512DQ-BW-LABEL: store_i8_stride5_vf8: 896; AVX512DQ-BW: # %bb.0: 897; AVX512DQ-BW-NEXT: movq (%r8), %rax 898; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 899; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 900; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 901; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 902; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 903; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 904; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 905; AVX512DQ-BW-NEXT: vmovq %rax, %xmm3 906; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 907; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 908; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 909; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm2, %ymm2 910; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] 911; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 912; AVX512DQ-BW-NEXT: movl $554189328, %ecx # imm = 0x21084210 913; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 914; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} 915; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 916; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 917; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 918; AVX512DQ-BW-NEXT: shrq $48, %rax 919; AVX512DQ-BW-NEXT: vpbroadcastw %eax, %xmm1 920; AVX512DQ-BW-NEXT: movw $132, %ax 921; AVX512DQ-BW-NEXT: kmovd %eax, %k1 922; AVX512DQ-BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 923; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 924; AVX512DQ-BW-NEXT: vmovq %xmm0, 32(%r9) 925; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%r9) 926; AVX512DQ-BW-NEXT: vzeroupper 927; AVX512DQ-BW-NEXT: retq 928; 929; 
AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf8: 930; AVX512DQ-BW-FCP: # %bb.0: 931; AVX512DQ-BW-FCP-NEXT: movq (%r8), %rax 932; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 933; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 934; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 935; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 936; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 937; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 938; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 939; AVX512DQ-BW-FCP-NEXT: vmovq %rax, %xmm3 940; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 941; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] 942; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] 943; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2 944; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] 945; AVX512DQ-BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 946; AVX512DQ-BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210 947; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 948; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} 949; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] 950; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] 951; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 952; AVX512DQ-BW-FCP-NEXT: shrq $48, %rax 953; AVX512DQ-BW-FCP-NEXT: vpbroadcastw %eax, %xmm1 954; AVX512DQ-BW-FCP-NEXT: movw $132, %ax 955; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 956; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 957; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 958; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 32(%r9) 959; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%r9) 960; AVX512DQ-BW-FCP-NEXT: vzeroupper 961; AVX512DQ-BW-FCP-NEXT: retq 962 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 963 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64 964 %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64 965 %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64 966 %in.vec4 = load <8 x i8>, ptr %in.vecptr4, align 64 967 %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 968 %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 969 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 970 %4 = shufflevector <8 x i8> %in.vec4, <8 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 
  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <40 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
  %interleaved.vec = shufflevector <40 x i8> %5, <40 x i8> poison, <40 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 1, i32 9, i32 17, i32 25, i32 33, i32 2, i32 10, i32 18, i32 26, i32 34, i32 3, i32 11, i32 19, i32 27, i32 35, i32 4, i32 12, i32 20, i32 28, i32 36, i32 5, i32 13, i32 21, i32 29, i32 37, i32 6, i32 14, i32 22, i32 30, i32 38, i32 7, i32 15, i32 23, i32 31, i32 39>
  store <40 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride5_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm12
; SSE-NEXT: movdqa (%rsi), %xmm8
; SSE-NEXT: movdqa (%rdx), %xmm9
; SSE-NEXT: movdqa (%rcx), %xmm4
; SSE-NEXT: movdqa (%r8), %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: pand %xmm6, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa %xmm6, %xmm5
; SSE-NEXT: pandn %xmm3, %xmm5
; SSE-NEXT: por %xmm1, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
; SSE-NEXT: pand %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,2,2]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; SSE-NEXT: pand %xmm1, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0]
; SSE-NEXT: movdqa %xmm1, %xmm11
; SSE-NEXT: pandn %xmm10, %xmm11
; SSE-NEXT: por %xmm7, %xmm11
; SSE-NEXT: movdqa %xmm2, %xmm10
; SSE-NEXT: pandn %xmm11, %xmm10
; SSE-NEXT: por %xmm5, %xmm10
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
; SSE-NEXT: pand %xmm7, %xmm10
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2]
; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: pandn %xmm11, %xmm0
; SSE-NEXT: por %xmm10, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
; SSE-NEXT: pand %xmm12, %xmm11
; SSE-NEXT: movdqa %xmm8, %xmm0
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 =
xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 1026; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1027; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] 1028; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] 1029; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,4,7] 1030; SSE-NEXT: movdqa %xmm12, %xmm14 1031; SSE-NEXT: pandn %xmm13, %xmm14 1032; SSE-NEXT: por %xmm11, %xmm14 1033; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 1034; SSE-NEXT: movdqa %xmm13, %xmm11 1035; SSE-NEXT: pandn %xmm14, %xmm11 1036; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7] 1037; SSE-NEXT: movdqa %xmm9, %xmm10 1038; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] 1039; SSE-NEXT: movdqa %xmm1, %xmm15 1040; SSE-NEXT: pandn %xmm14, %xmm15 1041; SSE-NEXT: movdqa %xmm4, %xmm14 1042; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] 1043; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] 1044; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] 1045; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4] 1046; SSE-NEXT: pand %xmm1, %xmm0 1047; SSE-NEXT: por %xmm15, %xmm0 1048; SSE-NEXT: pand %xmm13, %xmm0 1049; SSE-NEXT: por %xmm11, %xmm0 1050; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] 1051; SSE-NEXT: movdqa %xmm6, %xmm11 1052; SSE-NEXT: pandn %xmm15, %xmm11 1053; SSE-NEXT: pand %xmm6, %xmm0 1054; SSE-NEXT: por %xmm0, %xmm11 1055; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1056; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3] 1057; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] 1058; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 1059; SSE-NEXT: movdqa %xmm7, %xmm15 1060; SSE-NEXT: pandn %xmm0, %xmm15 1061; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] 1062; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1063; SSE-NEXT: pand %xmm7, %xmm0 1064; SSE-NEXT: por %xmm0, %xmm15 1065; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 1066; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 1067; SSE-NEXT: movdqa %xmm1, %xmm3 1068; SSE-NEXT: pandn %xmm0, %xmm3 1069; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1070; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] 1071; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] 1072; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] 1073; SSE-NEXT: pand %xmm1, %xmm0 1074; SSE-NEXT: por %xmm3, %xmm0 1075; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 1076; SSE-NEXT: pand %xmm3, %xmm0 1077; SSE-NEXT: pandn %xmm15, %xmm3 1078; SSE-NEXT: por %xmm0, %xmm3 1079; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] 1080; SSE-NEXT: movdqa %xmm12, %xmm15 1081; SSE-NEXT: pandn %xmm0, %xmm15 1082; SSE-NEXT: pand %xmm12, %xmm3 1083; SSE-NEXT: por %xmm3, %xmm15 1084; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7] 1085; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1086; SSE-NEXT: pand %xmm12, %xmm0 1087; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7] 1088; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] 1089; SSE-NEXT: pandn %xmm3, %xmm12 1090; SSE-NEXT: por %xmm0, %xmm12 1091; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] 1092; SSE-NEXT: pand %xmm6, %xmm0 1093; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,2,2,4,5,6,7] 
1094; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3] 1095; SSE-NEXT: pandn %xmm3, %xmm6 1096; SSE-NEXT: por %xmm0, %xmm6 1097; SSE-NEXT: pand %xmm13, %xmm6 1098; SSE-NEXT: pandn %xmm12, %xmm13 1099; SSE-NEXT: por %xmm6, %xmm13 1100; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] 1101; SSE-NEXT: movdqa %xmm1, %xmm6 1102; SSE-NEXT: pandn %xmm0, %xmm6 1103; SSE-NEXT: pand %xmm1, %xmm13 1104; SSE-NEXT: por %xmm13, %xmm6 1105; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7] 1106; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1107; SSE-NEXT: pand %xmm1, %xmm0 1108; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] 1109; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] 1110; SSE-NEXT: pandn %xmm3, %xmm1 1111; SSE-NEXT: por %xmm0, %xmm1 1112; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] 1113; SSE-NEXT: pand %xmm7, %xmm0 1114; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 1115; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] 1116; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] 1117; SSE-NEXT: pandn %xmm3, %xmm7 1118; SSE-NEXT: por %xmm0, %xmm7 1119; SSE-NEXT: pand %xmm2, %xmm7 1120; SSE-NEXT: pandn %xmm1, %xmm2 1121; SSE-NEXT: por %xmm7, %xmm2 1122; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 1123; SSE-NEXT: pand %xmm0, %xmm2 1124; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] 1125; SSE-NEXT: pandn %xmm1, %xmm0 1126; SSE-NEXT: por %xmm2, %xmm0 1127; SSE-NEXT: movdqa %xmm0, 64(%r9) 1128; SSE-NEXT: movdqa %xmm6, (%r9) 1129; SSE-NEXT: movdqa %xmm15, 16(%r9) 1130; SSE-NEXT: movdqa %xmm11, 48(%r9) 1131; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1132; SSE-NEXT: movaps %xmm0, 32(%r9) 1133; SSE-NEXT: retq 1134; 1135; AVX-LABEL: store_i8_stride5_vf16: 1136; AVX: # %bb.0: 1137; AVX-NEXT: vmovdqa (%rdi), %xmm2 1138; AVX-NEXT: vmovdqa (%rsi), %xmm3 1139; AVX-NEXT: vmovdqa (%rdx), %xmm1 1140; AVX-NEXT: vmovdqa (%rcx), %xmm4 1141; AVX-NEXT: vmovdqa (%r8), %xmm0 1142; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero 1143; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] 1144; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 1145; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u] 1146; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] 1147; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 1148; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255] 1149; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 1150; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15] 1151; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero 1152; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 1153; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 1154; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero 1155; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 1156; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero 1157; AVX-NEXT: vpor %xmm6, 
%xmm8, %xmm6 1158; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1159; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero 1160; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 1161; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6] 1162; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 1163; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero 1164; AVX-NEXT: vpor %xmm10, %xmm9, %xmm9 1165; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero 1166; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 1167; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12] 1168; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2 1169; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero 1170; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 1171; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero 1172; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 1173; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8] 1174; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 1175; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero 1176; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 1177; AVX-NEXT: vmovdqa %xmm1, 48(%r9) 1178; AVX-NEXT: vmovdqa %xmm2, 16(%r9) 1179; AVX-NEXT: vmovdqa %xmm9, (%r9) 1180; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1181; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0 1182; AVX-NEXT: vmovdqa %xmm0, 64(%r9) 1183; AVX-NEXT: vmovdqa %xmm5, 32(%r9) 1184; AVX-NEXT: retq 1185; 1186; AVX2-LABEL: store_i8_stride5_vf16: 1187; AVX2: # %bb.0: 1188; AVX2-NEXT: vmovdqa (%rdi), %xmm1 1189; AVX2-NEXT: vmovdqa (%rdx), %xmm2 1190; AVX2-NEXT: vmovdqa (%r8), %xmm0 1191; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1192; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1193; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] 1194; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] 1195; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero 1196; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 1197; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero 1198; AVX2-NEXT: 
vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] 1199; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero 1200; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 1201; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 1202; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 1203; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] 1204; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] 1205; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 1206; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 1207; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] 1208; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero 1209; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] 1210; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero 1211; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 1212; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] 1213; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 1214; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] 1215; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] 1216; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 1217; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 1218; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] 1219; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 1220; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 1221; AVX2-NEXT: vmovdqa %ymm1, (%r9) 1222; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1223; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm0 1224; AVX2-NEXT: vmovdqa %xmm0, 64(%r9) 1225; AVX2-NEXT: vmovdqa %ymm3, 32(%r9) 1226; AVX2-NEXT: vzeroupper 1227; AVX2-NEXT: retq 1228; 1229; AVX2-FP-LABEL: store_i8_stride5_vf16: 1230; AVX2-FP: # %bb.0: 1231; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 1232; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 1233; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 1234; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1235; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1236; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] 1237; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] 1238; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero 1239; AVX2-FP-NEXT: vpor %ymm4, %ymm3, %ymm3 1240; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero 1241; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] 1242; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero 1243; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 1244; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 1245; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 1246; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] 1247; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] 1248; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 1249; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 1250; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] 1251; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero 1252; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] 1253; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero 1254; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 1255; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] 1256; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero 1257; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] 1258; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] 1259; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 1260; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] 1261; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] 1262; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 1263; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 1264; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9) 1265; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1266; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm0 1267; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9) 1268; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) 1269; AVX2-FP-NEXT: vzeroupper 1270; AVX2-FP-NEXT: retq 1271; 1272; AVX2-FCP-LABEL: store_i8_stride5_vf16: 1273; AVX2-FCP: # %bb.0: 1274; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 1275; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 1276; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0 1277; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1278; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1279; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] 1280; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,13],zero,zero,zero,xmm3[6,14],zero,zero,zero,xmm3[7,15],zero 1281; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] 1282; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero,zero,zero 1283; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 1284; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] 1285; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero 
1286; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,2,0] 1287; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] 1288; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 1289; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,0,1,1] 1290; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 1291; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 1292; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 1293; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7] 1294; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 1295; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,13],zero,zero,zero,ymm1[18,22],zero,zero,zero,ymm1[19,23],zero,zero,zero,ymm1[24,28],zero,zero 1296; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] 1297; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 1298; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,6],zero,zero,zero,ymm2[3,7],zero,zero,zero,ymm2[8,12],zero,zero,zero,ymm2[9,17],zero,zero,zero,ymm2[22,18],zero,zero,zero,ymm2[23,19],zero,zero,zero,ymm2[24,28] 1299; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 1300; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,1,2,2,2,2,2,2] 1301; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 1302; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 1303; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 1304; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) 1305; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) 1306; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1307; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 1308; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 1309; AVX2-FCP-NEXT: vzeroupper 1310; AVX2-FCP-NEXT: retq 1311; 1312; AVX512-LABEL: store_i8_stride5_vf16: 1313; AVX512: # %bb.0: 1314; AVX512-NEXT: vmovdqa (%rdi), %xmm1 1315; AVX512-NEXT: vmovdqa (%rdx), %xmm2 1316; AVX512-NEXT: vmovdqa (%r8), %xmm0 1317; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1318; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1319; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] 1320; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] 1321; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] 1322; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 1323; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) 1324; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] 1325; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero 1326; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1327; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] 1328; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] 1329; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero 1330; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) 1331; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] 1332; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] 1333; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 1334; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 1335; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] 1336; AVX512-NEXT: vpermd %zmm0, %zmm4, %zmm4 1337; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) 1338; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] 1339; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] 1340; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1341; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] 1342; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 1343; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero 1344; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1345; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 1346; AVX512-NEXT: vmovdqa %xmm0, 64(%r9) 1347; AVX512-NEXT: vmovdqa64 %zmm4, (%r9) 1348; AVX512-NEXT: vzeroupper 1349; AVX512-NEXT: retq 1350; 1351; AVX512-FCP-LABEL: store_i8_stride5_vf16: 1352; AVX512-FCP: # %bb.0: 1353; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 1354; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 1355; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0 1356; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1357; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1358; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] 1359; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero 1360; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] 1361; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 1362; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero 1363; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1364; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] 1365; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] 1366; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] 1367; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 1368; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] 1369; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 1370; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 1371; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 1372; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] 1373; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 1374; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) 1375; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] 
1376; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] 1377; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1378; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] 1379; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 1380; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero 1381; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1382; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 1383; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 1384; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 1385; AVX512-FCP-NEXT: vzeroupper 1386; AVX512-FCP-NEXT: retq 1387; 1388; AVX512DQ-LABEL: store_i8_stride5_vf16: 1389; AVX512DQ: # %bb.0: 1390; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 1391; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 1392; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 1393; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1394; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1395; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] 1396; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] 1397; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] 1398; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 1399; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) 1400; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] 1401; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero 1402; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1403; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] 1404; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] 1405; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero 1406; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) 1407; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] 1408; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] 1409; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 1410; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3 1411; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] 1412; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm4 1413; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) 1414; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] 1415; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] 1416; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1417; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] 1418; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 1419; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero 1420; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1421; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 1422; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) 1423; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) 1424; AVX512DQ-NEXT: vzeroupper 1425; AVX512DQ-NEXT: retq 1426; 1427; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16: 1428; AVX512DQ-FCP: # %bb.0: 1429; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 1430; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 1431; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 1432; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 1433; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 1434; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] 1435; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero 1436; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] 1437; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 1438; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero 1439; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1440; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] 1441; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] 1442; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] 1443; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 1444; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] 1445; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 1446; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 1447; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 1448; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] 1449; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 1450; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) 1451; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] 1452; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] 1453; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1454; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] 1455; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 1456; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero 1457; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] 1458; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 1459; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 1460; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 1461; AVX512DQ-FCP-NEXT: vzeroupper 1462; AVX512DQ-FCP-NEXT: retq 1463; 1464; AVX512BW-LABEL: store_i8_stride5_vf16: 1465; AVX512BW: # %bb.0: 1466; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 1467; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 1468; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 1469; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1470; AVX512BW-NEXT: 
vinserti128 $1, (%rsi), %ymm0, %ymm0 1471; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero 1472; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] 1473; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero 1474; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3 1475; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] 1476; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] 1477; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero 1478; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 1479; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 1480; AVX512BW-NEXT: kmovd %eax, %k1 1481; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} 1482; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] 1483; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] 1484; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] 1485; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero 1486; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 1487; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1488; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] 1489; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 1490; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 1491; AVX512BW-NEXT: kmovq %rax, %k1 1492; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} 1493; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1494; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero 1495; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] 1496; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero 1497; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] 1498; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 1499; AVX512BW-NEXT: vmovdqa %xmm2, 64(%r9) 1500; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) 1501; AVX512BW-NEXT: vzeroupper 1502; AVX512BW-NEXT: retq 1503; 1504; AVX512BW-FCP-LABEL: store_i8_stride5_vf16: 1505; AVX512BW-FCP: # %bb.0: 1506; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1507; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 1508; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 1509; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1510; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1511; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7] 1512; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 1513; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] 1514; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, 
%zmm3 1515; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60] 1516; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] 1517; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 1518; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] 1519; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 1520; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero 1521; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 1522; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4 1523; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] 1524; AVX512BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 1525; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 1526; AVX512BW-FCP-NEXT: kmovq %rax, %k1 1527; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} 1528; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] 1529; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1530; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero 1531; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] 1532; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero 1533; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1 1534; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 1535; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 1536; AVX512BW-FCP-NEXT: vzeroupper 1537; AVX512BW-FCP-NEXT: retq 1538; 1539; AVX512DQ-BW-LABEL: store_i8_stride5_vf16: 1540; AVX512DQ-BW: # %bb.0: 1541; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 1542; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 1543; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 1544; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1545; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1546; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero 1547; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] 1548; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero 1549; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3 1550; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] 1551; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] 1552; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = 
zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero 1553; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 1554; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 1555; AVX512DQ-BW-NEXT: kmovd %eax, %k1 1556; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} 1557; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] 1558; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] 1559; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] 1560; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero 1561; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 1562; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 1563; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] 1564; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 1565; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 1566; AVX512DQ-BW-NEXT: kmovq %rax, %k1 1567; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} 1568; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1569; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero 1570; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] 1571; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero 1572; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] 1573; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 1574; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 64(%r9) 1575; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) 1576; AVX512DQ-BW-NEXT: vzeroupper 1577; AVX512DQ-BW-NEXT: retq 1578; 1579; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf16: 1580; AVX512DQ-BW-FCP: # %bb.0: 1581; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 1582; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 1583; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 1584; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 1585; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 1586; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7] 1587; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 1588; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] 1589; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 1590; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60] 1591; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] 1592; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 1593; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] 1594; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 1595; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = 
zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero 1596; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 1597; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4 1598; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] 1599; AVX512DQ-BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 1600; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 1601; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 1602; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} 1603; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] 1604; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] 1605; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero 1606; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] 1607; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero 1608; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1 1609; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9) 1610; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 1611; AVX512DQ-BW-FCP-NEXT: vzeroupper 1612; AVX512DQ-BW-FCP-NEXT: retq 1613 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 1614 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64 1615 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64 1616 %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64 1617 %in.vec4 = load <16 x i8>, ptr %in.vecptr4, align 64 1618 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1619 %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1620 %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1621 %4 = shufflevector <16 x i8> %in.vec4, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1622 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <80 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79> 1623 %interleaved.vec = shufflevector <80 x i8> %5, <80 x i8> poison, <80 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 1, i32 17, i32 33, i32 49, i32 65, i32 2, i32 18, i32 34, i32 50, i32 66, i32 3, i32 19, i32 35, i32 51, i32 67, i32 4, i32 20, i32 36, i32 52, i32 68, i32 5, i32 21, i32 37, i32 53, i32 69, i32 6, i32 22, i32 38, i32 54, i32 70, i32 7, i32 23, i32 39, i32 55, i32 71, i32 8, i32 24, i32 40, i32 56, i32 72, i32 9, i32 25, i32 41, i32 57, i32 73, i32 10, i32 26, i32 42, i32 58, i32 74, i32 11, i32 27, i32 43, i32 59, i32 75, i32 12, i32 28, i32 44, i32 60, i32 76, i32 13, i32 29, i32 45, i32 61, i32 77, i32 14, i32 30, i32 46, i32 62, i32 78, i32 15, i32 31, i32 47, i32 63, i32 79> 1624 store <80 x i8> %interleaved.vec, ptr %out.vec, align 64 1625 ret void 1626} 1627 1628define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { 1629; SSE-LABEL: store_i8_stride5_vf32: 1630; SSE: # %bb.0: 1631; SSE-NEXT: subq $152, %rsp 1632; SSE-NEXT: movdqa 16(%rdi), %xmm15 1633; SSE-NEXT: movdqa (%rsi), %xmm9 1634; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1635; SSE-NEXT: movdqa 16(%rsi), %xmm7 1636; SSE-NEXT: movdqa (%rdx), %xmm2 1637; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1638; SSE-NEXT: movdqa 16(%rdx), %xmm0 1639; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1640; SSE-NEXT: movdqa (%rcx), %xmm11 1641; SSE-NEXT: movdqa 16(%rcx), %xmm12 1642; SSE-NEXT: movdqa 16(%r8), %xmm14 1643; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 1644; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 1645; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 1646; SSE-NEXT: pand %xmm13, %xmm0 1647; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] 1648; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1649; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] 1650; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 1651; SSE-NEXT: movdqa %xmm13, %xmm4 1652; SSE-NEXT: pandn %xmm1, %xmm4 1653; SSE-NEXT: por %xmm0, %xmm4 1654; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 1655; SSE-NEXT: pand %xmm8, %xmm4 1656; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] 1657; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 1658; SSE-NEXT: pand %xmm3, %xmm0 1659; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] 1660; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,7,6,7] 1661; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1662; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] 1663; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] 1664; SSE-NEXT: movdqa %xmm3, %xmm5 1665; SSE-NEXT: pandn %xmm1, %xmm5 1666; SSE-NEXT: por %xmm0, %xmm5 1667; SSE-NEXT: movdqa %xmm8, %xmm0 1668; SSE-NEXT: pandn %xmm5, %xmm0 1669; SSE-NEXT: por %xmm4, %xmm0 1670; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 1671; SSE-NEXT: pand %xmm10, %xmm0 1672; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] 1673; SSE-NEXT: movdqa %xmm10, %xmm4 1674; SSE-NEXT: pandn %xmm1, %xmm4 1675; SSE-NEXT: por %xmm0, %xmm4 1676; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1677; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] 1678; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 1679; SSE-NEXT: pand %xmm13, %xmm0 1680; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] 1681; SSE-NEXT: movdqa %xmm11, %xmm2 1682; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1683; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] 1684; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 1685; SSE-NEXT: movdqa %xmm13, %xmm4 1686; SSE-NEXT: pandn %xmm1, %xmm4 1687; SSE-NEXT: por %xmm0, %xmm4 1688; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] 1689; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1690; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1691; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] 1692; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] 1693; SSE-NEXT: movdqa %xmm3, %xmm1 1694; SSE-NEXT: pandn %xmm0, %xmm1 1695; SSE-NEXT: movdqa (%rdi), %xmm0 1696; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1697; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 1698; SSE-NEXT: pand %xmm3, %xmm0 1699; SSE-NEXT: por %xmm0, %xmm1 1700; SSE-NEXT: movdqa %xmm8, %xmm0 1701; SSE-NEXT: pandn %xmm1, %xmm0 1702; SSE-NEXT: pand %xmm8, %xmm4 1703; SSE-NEXT: por %xmm4, %xmm0 1704; SSE-NEXT: movdqa (%r8), %xmm1 1705; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1706; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 1707; SSE-NEXT: movdqa %xmm10, %xmm4 1708; SSE-NEXT: pandn %xmm1, %xmm4 1709; SSE-NEXT: pand %xmm10, %xmm0 1710; SSE-NEXT: por %xmm0, %xmm4 1711; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1712; SSE-NEXT: movdqa %xmm7, %xmm11 1713; SSE-NEXT: movdqa %xmm7, %xmm0 1714; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] 1715; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1716; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 1717; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 1718; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] 1719; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 1720; SSE-NEXT: movdqa %xmm9, %xmm4 1721; SSE-NEXT: pandn %xmm0, %xmm4 1722; SSE-NEXT: movdqa %xmm15, %xmm6 1723; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1724; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] 1725; SSE-NEXT: pand %xmm9, %xmm0 1726; SSE-NEXT: por %xmm0, %xmm4 1727; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 1728; SSE-NEXT: movdqa %xmm8, %xmm0 1729; SSE-NEXT: pandn %xmm4, %xmm0 1730; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 1731; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7] 1732; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] 1733; SSE-NEXT: movdqa %xmm3, %xmm5 1734; SSE-NEXT: pandn %xmm4, %xmm5 1735; SSE-NEXT: movdqa %xmm12, %xmm15 1736; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] 1737; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] 1738; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] 1739; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] 1740; SSE-NEXT: pand %xmm3, %xmm4 1741; SSE-NEXT: por %xmm5, %xmm4 1742; SSE-NEXT: pand %xmm8, %xmm4 1743; SSE-NEXT: por %xmm0, %xmm4 1744; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1745; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] 1746; SSE-NEXT: movdqa %xmm13, %xmm1 1747; SSE-NEXT: pandn %xmm0, %xmm1 1748; SSE-NEXT: pand %xmm13, %xmm4 1749; SSE-NEXT: por %xmm4, %xmm1 1750; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1751; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1752; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1753; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,2,3] 1754; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] 1755; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 1756; SSE-NEXT: movdqa %xmm10, %xmm4 1757; SSE-NEXT: pandn %xmm0, %xmm4 1758; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,2,2,3,4,5,6,7] 1759; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1760; SSE-NEXT: pand %xmm10, %xmm0 1761; SSE-NEXT: por %xmm0, %xmm4 1762; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 1763; SSE-NEXT: movdqa %xmm1, %xmm5 1764; SSE-NEXT: pandn %xmm4, %xmm5 1765; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] 1766; SSE-NEXT: movdqa %xmm3, %xmm7 1767; SSE-NEXT: pandn %xmm4, %xmm7 1768; SSE-NEXT: movdqa %xmm11, %xmm0 1769; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 1770; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill 1771; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] 1772; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] 1773; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] 1774; SSE-NEXT: pand %xmm3, %xmm4 1775; SSE-NEXT: por %xmm7, %xmm4 1776; SSE-NEXT: pand %xmm1, %xmm4 1777; SSE-NEXT: por %xmm5, %xmm4 1778; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] 1779; SSE-NEXT: movdqa %xmm9, %xmm0 1780; SSE-NEXT: pandn %xmm5, %xmm0 1781; SSE-NEXT: pand %xmm9, %xmm4 1782; SSE-NEXT: por %xmm4, %xmm0 1783; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1784; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 1785; SSE-NEXT: movdqa %xmm11, %xmm0 1786; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] 1787; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1788; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] 1789; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] 1790; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,7] 1791; SSE-NEXT: movdqa %xmm9, %xmm5 1792; SSE-NEXT: pandn %xmm4, %xmm5 1793; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1794; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] 1795; SSE-NEXT: pand %xmm9, %xmm4 1796; SSE-NEXT: por %xmm4, %xmm5 1797; SSE-NEXT: movdqa %xmm8, %xmm4 1798; SSE-NEXT: pandn %xmm5, %xmm4 1799; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1800; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,5,6,6,7] 1801; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] 1802; SSE-NEXT: movdqa %xmm3, %xmm7 1803; SSE-NEXT: pandn %xmm5, %xmm7 1804; SSE-NEXT: movdqa %xmm2, %xmm12 1805; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] 1806; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,1,2,1] 1807; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] 1808; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] 1809; SSE-NEXT: pand %xmm3, %xmm14 1810; SSE-NEXT: por %xmm7, %xmm14 1811; SSE-NEXT: pand %xmm8, %xmm14 1812; SSE-NEXT: por %xmm4, %xmm14 1813; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1814; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] 1815; SSE-NEXT: movdqa %xmm13, %xmm7 1816; SSE-NEXT: pandn %xmm4, %xmm7 1817; SSE-NEXT: pand %xmm13, %xmm14 1818; SSE-NEXT: por %xmm14, %xmm7 1819; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1820; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1821; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1822; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] 1823; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7] 1824; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 1825; SSE-NEXT: movdqa %xmm10, %xmm14 1826; SSE-NEXT: pandn %xmm4, %xmm14 1827; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,2,2,3,4,5,6,7] 1828; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1829; SSE-NEXT: pand %xmm10, %xmm4 1830; SSE-NEXT: por %xmm4, %xmm14 1831; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] 1832; SSE-NEXT: movdqa %xmm3, %xmm2 1833; SSE-NEXT: pandn %xmm4, %xmm2 1834; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1835; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1836; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] 1837; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] 1838; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] 1839; SSE-NEXT: pand %xmm3, %xmm4 1840; SSE-NEXT: por %xmm2, %xmm4 1841; SSE-NEXT: pand %xmm1, %xmm4 1842; SSE-NEXT: pandn %xmm14, %xmm1 1843; SSE-NEXT: por %xmm4, %xmm1 1844; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] 1845; SSE-NEXT: movdqa %xmm9, %xmm7 1846; SSE-NEXT: pandn %xmm2, %xmm7 1847; SSE-NEXT: pand %xmm9, %xmm1 1848; SSE-NEXT: por %xmm1, %xmm7 1849; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,7,6,7] 1850; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2] 1851; SSE-NEXT: movdqa %xmm3, %xmm2 1852; SSE-NEXT: pandn %xmm0, %xmm2 1853; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1854; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,6,6,7] 1855; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1856; SSE-NEXT: pand %xmm3, %xmm0 1857; SSE-NEXT: por %xmm0, %xmm2 1858; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 1859; SSE-NEXT: movdqa %xmm6, %xmm0 1860; SSE-NEXT: pandn %xmm2, %xmm0 1861; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 1862; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,6] 1863; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2] 1864; SSE-NEXT: movdqa %xmm10, 
%xmm14 1865; SSE-NEXT: pandn %xmm2, %xmm14 1866; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 1867; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] 1868; SSE-NEXT: pand %xmm10, %xmm2 1869; SSE-NEXT: por %xmm2, %xmm14 1870; SSE-NEXT: pand %xmm6, %xmm14 1871; SSE-NEXT: por %xmm0, %xmm14 1872; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 1873; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1874; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] 1875; SSE-NEXT: movdqa %xmm11, %xmm15 1876; SSE-NEXT: pandn %xmm0, %xmm15 1877; SSE-NEXT: pand %xmm11, %xmm14 1878; SSE-NEXT: por %xmm14, %xmm15 1879; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1880; SSE-NEXT: # xmm0 = mem[1,0,2,3,4,5,6,7] 1881; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1882; SSE-NEXT: movdqa %xmm9, %xmm2 1883; SSE-NEXT: pandn %xmm0, %xmm2 1884; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7] 1885; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1886; SSE-NEXT: pand %xmm9, %xmm0 1887; SSE-NEXT: por %xmm0, %xmm2 1888; SSE-NEXT: movdqa %xmm8, %xmm0 1889; SSE-NEXT: pandn %xmm2, %xmm0 1890; SSE-NEXT: pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload 1891; SSE-NEXT: # xmm2 = mem[0,1,2,2,4,5,6,7] 1892; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] 1893; SSE-NEXT: movdqa %xmm13, %xmm14 1894; SSE-NEXT: pandn %xmm2, %xmm14 1895; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] 1896; SSE-NEXT: pand %xmm13, %xmm2 1897; SSE-NEXT: por %xmm2, %xmm14 1898; SSE-NEXT: pand %xmm8, %xmm14 1899; SSE-NEXT: por %xmm0, %xmm14 1900; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] 1901; SSE-NEXT: movdqa %xmm3, %xmm0 1902; SSE-NEXT: pandn %xmm2, %xmm0 1903; SSE-NEXT: pand %xmm3, %xmm14 1904; SSE-NEXT: por %xmm14, %xmm0 1905; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7] 1906; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] 1907; SSE-NEXT: movdqa %xmm3, %xmm5 1908; SSE-NEXT: pandn %xmm2, %xmm5 1909; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 1910; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,6,6,7] 1911; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] 1912; SSE-NEXT: pand %xmm3, %xmm2 1913; SSE-NEXT: por %xmm2, %xmm5 1914; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 1915; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] 1916; SSE-NEXT: pand %xmm10, %xmm2 1917; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 1918; SSE-NEXT: # xmm12 = mem[0,1,2,3,7,5,6,6] 1919; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2] 1920; SSE-NEXT: pandn %xmm12, %xmm10 1921; SSE-NEXT: por %xmm2, %xmm10 1922; SSE-NEXT: movdqa %xmm6, %xmm1 1923; SSE-NEXT: pand %xmm6, %xmm10 1924; SSE-NEXT: pandn %xmm5, %xmm1 1925; SSE-NEXT: por %xmm10, %xmm1 1926; SSE-NEXT: pand %xmm11, %xmm1 1927; SSE-NEXT: movdqa %xmm1, %xmm5 1928; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1929; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] 1930; SSE-NEXT: pandn %xmm2, %xmm11 1931; SSE-NEXT: por %xmm5, %xmm11 1932; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7] 1933; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1934; SSE-NEXT: pand %xmm9, %xmm2 1935; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 1936; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] 1937; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] 1938; SSE-NEXT: pandn %xmm5, %xmm9 1939; SSE-NEXT: por %xmm2, %xmm9 1940; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] 1941; SSE-NEXT: pand 
%xmm13, %xmm2 1942; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 1943; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] 1944; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] 1945; SSE-NEXT: pandn %xmm5, %xmm13 1946; SSE-NEXT: por %xmm2, %xmm13 1947; SSE-NEXT: pand %xmm8, %xmm13 1948; SSE-NEXT: pandn %xmm9, %xmm8 1949; SSE-NEXT: por %xmm13, %xmm8 1950; SSE-NEXT: pand %xmm3, %xmm8 1951; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] 1952; SSE-NEXT: pandn %xmm2, %xmm3 1953; SSE-NEXT: por %xmm8, %xmm3 1954; SSE-NEXT: movdqa %xmm3, (%r9) 1955; SSE-NEXT: movdqa %xmm11, 64(%r9) 1956; SSE-NEXT: movdqa %xmm0, 80(%r9) 1957; SSE-NEXT: movdqa %xmm15, 144(%r9) 1958; SSE-NEXT: movdqa %xmm7, 16(%r9) 1959; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1960; SSE-NEXT: movaps %xmm0, 48(%r9) 1961; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1962; SSE-NEXT: movaps %xmm0, 96(%r9) 1963; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1964; SSE-NEXT: movaps %xmm0, 128(%r9) 1965; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1966; SSE-NEXT: movaps %xmm0, 32(%r9) 1967; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1968; SSE-NEXT: movaps %xmm0, 112(%r9) 1969; SSE-NEXT: addq $152, %rsp 1970; SSE-NEXT: retq 1971; 1972; AVX-LABEL: store_i8_stride5_vf32: 1973; AVX: # %bb.0: 1974; AVX-NEXT: vmovdqa 16(%rsi), %xmm10 1975; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 1976; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] 1977; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] 1978; AVX-NEXT: # xmm5 = mem[0,0] 1979; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm1 1980; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] 1981; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1982; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 1983; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm2 1984; AVX-NEXT: vmovdqa 16(%rcx), %xmm0 1985; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 1986; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1987; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] 1988; AVX-NEXT: # xmm4 = mem[0,0] 1989; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 1990; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1991; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] 1992; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 1993; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3 1994; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 1995; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 1996; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4],zero,xmm3[6,7,8,9],zero,xmm3[11,12,13,14],zero 1997; AVX-NEXT: vmovdqa 16(%r8), %xmm15 1998; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] 1999; AVX-NEXT: vpshufb %xmm12, %xmm15, %xmm6 2000; AVX-NEXT: vpor %xmm6, %xmm3, %xmm3 2001; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2002; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] 2003; 
AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[9],zero,zero,zero,zero,xmm15[10],zero,zero,zero,zero,xmm15[11],zero,zero,zero,zero 2004; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 2005; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2006; AVX-NEXT: vmovdqa (%rcx), %xmm7 2007; AVX-NEXT: vmovdqa (%rdx), %xmm8 2008; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] 2009; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2010; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2011; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] 2012; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 2013; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] 2014; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] 2015; AVX-NEXT: vmovdqa (%rsi), %xmm2 2016; AVX-NEXT: vmovdqa (%rdi), %xmm3 2017; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 2018; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2019; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm5 2020; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 2021; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 2022; AVX-NEXT: vandnps %ymm14, %ymm13, %ymm14 2023; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 2024; AVX-NEXT: vorps %ymm5, %ymm14, %ymm5 2025; AVX-NEXT: vextractf128 $1, %ymm5, %xmm14 2026; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3],zero,xmm14[5,6,7,8],zero,xmm14[10,11,12,13],zero,xmm14[15] 2027; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm15[0],zero,zero,zero,zero,xmm15[1],zero,zero,zero,zero,xmm15[2],zero 2028; AVX-NEXT: vpor %xmm9, %xmm14, %xmm4 2029; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2030; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[1,2,3,4],zero,xmm5[6,7,8,9],zero,xmm5[11,12,13,14],zero 2031; AVX-NEXT: vmovdqa (%r8), %xmm14 2032; AVX-NEXT: vpshufb %xmm12, %xmm14, %xmm9 2033; AVX-NEXT: vpor %xmm5, %xmm9, %xmm4 2034; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2035; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero 2036; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] 2037; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 2038; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] 2039; AVX-NEXT: # xmm9 = mem[0,0] 2040; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm5 2041; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 2042; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u] 2043; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] 2044; AVX-NEXT: # xmm6 = mem[0,0] 2045; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm12 2046; AVX-NEXT: vpor %xmm5, %xmm12, %xmm5 2047; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] 2048; AVX-NEXT: vmovddup {{.*#+}} xmm12 = 
[6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] 2049; AVX-NEXT: # xmm12 = mem[0,0] 2050; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm10 2051; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 2052; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 2053; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 2054; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5 2055; AVX-NEXT: vorps %ymm1, %ymm5, %ymm5 2056; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] 2057; AVX-NEXT: vpshufb %xmm10, %xmm5, %xmm1 2058; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] 2059; AVX-NEXT: vpshufb %xmm4, %xmm15, %xmm11 2060; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 2061; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 2062; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] 2063; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm0 2064; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] 2065; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm15 2066; AVX-NEXT: vpor %xmm0, %xmm15, %xmm15 2067; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 2068; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 2069; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] 2070; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 2071; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 2072; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm9 2073; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 2074; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] 2075; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 2076; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 2077; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 2078; AVX-NEXT: vandps %ymm12, %ymm9, %ymm9 2079; AVX-NEXT: vorps %ymm0, %ymm9, %ymm9 2080; AVX-NEXT: vextractf128 $1, %ymm9, %xmm0 2081; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm0 2082; AVX-NEXT: vpshufb %xmm4, %xmm14, %xmm4 2083; AVX-NEXT: vpor %xmm4, %xmm0, %xmm0 2084; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[0,1,2,3],zero,xmm9[5,6,7,8],zero,xmm9[10,11,12,13],zero,xmm9[15] 2085; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero 2086; AVX-NEXT: vpor %xmm4, %xmm9, %xmm4 2087; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] 2088; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3 2089; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 2090; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2091; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] 2092; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 2093; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm7[6,u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero 2094; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] 2095; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 2096; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] 2097; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] 2098; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 2099; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 2100; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3 2101; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 2102; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm3 2103; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm5 2104; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 2105; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 2106; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] 2107; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero 2108; AVX-NEXT: vpor %xmm5, %xmm2, %xmm2 2109; AVX-NEXT: vmovdqa %xmm2, 48(%r9) 2110; AVX-NEXT: vmovdqa %xmm3, 32(%r9) 2111; AVX-NEXT: vmovdqa %xmm4, (%r9) 2112; AVX-NEXT: vmovdqa %xmm0, 16(%r9) 2113; AVX-NEXT: vmovdqa %xmm15, 112(%r9) 2114; AVX-NEXT: vmovdqa %xmm1, 96(%r9) 2115; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2116; AVX-NEXT: vmovaps %xmm0, 64(%r9) 2117; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2118; AVX-NEXT: vmovaps %xmm0, 80(%r9) 2119; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2120; AVX-NEXT: vmovaps %xmm0, 128(%r9) 2121; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2122; AVX-NEXT: vmovaps %xmm0, 144(%r9) 2123; AVX-NEXT: vzeroupper 2124; AVX-NEXT: retq 2125; 2126; AVX2-LABEL: store_i8_stride5_vf32: 2127; AVX2: # %bb.0: 2128; AVX2-NEXT: vmovdqa (%rdi), %ymm3 2129; AVX2-NEXT: vmovdqa (%rsi), %ymm4 2130; AVX2-NEXT: vmovdqa (%rdx), %ymm1 2131; AVX2-NEXT: vmovdqa (%rcx), %ymm2 2132; AVX2-NEXT: vmovdqa (%r8), %ymm0 2133; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] 2134; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 2135; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,3,3,6,6,7,7] 2136; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] 2137; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 2138; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 2139; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] 2140; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] 2141; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 2142; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] 2143; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] 2144; AVX2-NEXT: # ymm8 = mem[0,1,0,1] 2145; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 2146; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] 2147; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] 2148; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 2149; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,2,3,3,6,6,7,7] 2150; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] 2151; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 2152; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 2153; AVX2-NEXT: vmovdqa (%rsi), %xmm7 2154; AVX2-NEXT: vmovdqa (%rdi), 
%xmm8 2155; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 2156; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2157; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 2158; AVX2-NEXT: vmovdqa (%rdx), %xmm9 2159; AVX2-NEXT: vmovdqa (%rcx), %xmm10 2160; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 2161; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2162; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] 2163; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] 2164; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 2165; AVX2-NEXT: vmovdqa (%r8), %xmm11 2166; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] 2167; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] 2168; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 2169; AVX2-NEXT: vpblendvb %ymm13, %ymm6, %ymm12, %ymm6 2170; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm10[6],zero,xmm10[8,u],zero,xmm10[7],zero,xmm10[9],zero,xmm10[11,u],zero,xmm10[10],zero,xmm10[12] 2171; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero 2172; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 2173; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] 2174; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[u],zero,xmm8[u,10],zero,xmm8[12],zero,xmm8[u,11] 2175; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero 2176; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 2177; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] 2178; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 2179; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 2180; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] 2181; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] 2182; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 2183; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 2184; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] 2185; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero 2186; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 2187; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2188; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2189; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] 2190; AVX2-NEXT: vpor %ymm9, %ymm10, %ymm9 2191; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2192; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = 
[255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] 2193; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 2194; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] 2195; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] 2196; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2197; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 2198; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] 2199; AVX2-NEXT: vpermd %ymm3, %ymm9, %ymm3 2200; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 2201; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] 2202; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 2203; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 2204; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero 2205; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 2206; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] 2207; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 2208; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] 2209; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 2210; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 2211; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 2212; AVX2-NEXT: vmovdqa %ymm0, 64(%r9) 2213; AVX2-NEXT: vmovdqa %ymm8, 96(%r9) 2214; AVX2-NEXT: vmovdqa %ymm5, 128(%r9) 2215; AVX2-NEXT: vmovdqa %ymm7, 32(%r9) 2216; AVX2-NEXT: vmovdqa %ymm6, (%r9) 2217; AVX2-NEXT: vzeroupper 2218; AVX2-NEXT: retq 2219; 2220; AVX2-FP-LABEL: store_i8_stride5_vf32: 2221; AVX2-FP: # %bb.0: 2222; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 2223; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm4 2224; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 2225; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 2226; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 2227; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 2228; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm7 2229; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2230; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2231; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 2232; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm8 2233; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 2234; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 2235; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2236; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] 2237; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] 2238; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 2239; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10 2240; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] 2241; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = 
ymm11[0,0,0,1] 2242; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 2243; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 2244; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] 2245; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero 2246; AVX2-FP-NEXT: vpor %xmm9, %xmm8, %xmm8 2247; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] 2248; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11] 2249; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero 2250; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 2251; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 2252; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 2253; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 2254; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] 2255; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] 2256; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 2257; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 2258; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] 2259; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero 2260; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 2261; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 2262; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2263; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] 2264; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 2265; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2266; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] 2267; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 2268; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] 2269; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] 2270; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2271; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 2272; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] 2273; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero 2274; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 2275; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2276; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero 2277; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] 2278; AVX2-FP-NEXT: vpor %ymm9, %ymm10, %ymm9 2279; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2280; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] 2281; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 2282; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7] 2283; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2284; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 2285; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 2286; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] 2287; AVX2-FP-NEXT: vpermd %ymm3, %ymm9, %ymm3 2288; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 2289; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] 2290; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 2291; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 2292; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero 2293; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 2294; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] 2295; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 2296; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] 2297; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0 2298; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 2299; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 2300; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) 2301; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%r9) 2302; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%r9) 2303; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) 2304; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) 2305; AVX2-FP-NEXT: vzeroupper 2306; AVX2-FP-NEXT: retq 2307; 2308; AVX2-FCP-LABEL: store_i8_stride5_vf32: 2309; AVX2-FCP: # %bb.0: 2310; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 2311; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4 2312; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 2313; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 2314; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 2315; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 2316; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 2317; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 2318; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2319; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 2320; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm8 2321; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm9 2322; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 2323; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2324; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] 2325; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] 2326; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 2327; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1] 2328; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 2329; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 2330; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 2331; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] 2332; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero 2333; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 2334; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] 2335; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11] 2336; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero 2337; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 2338; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 2339; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 2340; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 2341; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2] 2342; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm7 2343; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 2344; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 2345; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] 2346; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero 2347; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 2348; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 2349; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2350; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] 2351; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 2352; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2353; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] 2354; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 2355; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] 2356; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm8 2357; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2358; 
AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 2359; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] 2360; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero 2361; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 2362; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2363; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero 2364; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] 2365; AVX2-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 2366; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2367; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] 2368; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 2369; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] 2370; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm9 2371; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 2372; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 2373; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] 2374; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3 2375; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 2376; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] 2377; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 2378; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 2379; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero 2380; AVX2-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 2381; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] 2382; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 2383; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] 2384; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 2385; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 2386; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 2387; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) 2388; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%r9) 2389; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%r9) 2390; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) 2391; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) 2392; AVX2-FCP-NEXT: vzeroupper 2393; AVX2-FCP-NEXT: retq 2394; 2395; AVX512-LABEL: store_i8_stride5_vf32: 2396; AVX512: # %bb.0: 2397; AVX512-NEXT: vmovdqa (%rdi), %ymm3 2398; AVX512-NEXT: vmovdqa (%rsi), %ymm4 2399; AVX512-NEXT: vmovdqa (%rdx), %ymm1 2400; AVX512-NEXT: vmovdqa (%rcx), %ymm2 2401; AVX512-NEXT: vmovdqa (%r8), %ymm0 2402; AVX512-NEXT: vmovdqa (%rdi), 
%xmm5 2403; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] 2404; AVX512-NEXT: vmovdqa (%rsi), %xmm7 2405; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero 2406; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6 2407; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 2408; AVX512-NEXT: vmovdqa (%rcx), %xmm8 2409; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] 2410; AVX512-NEXT: vmovdqa (%rdx), %xmm10 2411; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero 2412; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9 2413; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] 2414; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 2415; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6)) 2416; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 2417; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 2418; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2419; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] 2420; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 2421; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2422; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 2423; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 2424; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm8 ^ (ymm7 & (ymm5 ^ ymm8)) 2425; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] 2426; AVX512-NEXT: vmovdqa (%r8), %xmm6 2427; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] 2428; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm6 2429; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) 2430; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u] 2431; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u] 2432; AVX512-NEXT: vpor %ymm5, %ymm8, %ymm5 2433; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero 2434; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19] 2435; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8 2436; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5)) 2437; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero 2438; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u] 2439; AVX512-NEXT: vpor %ymm5, %ymm9, %ymm5 2440; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] 2441; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2442; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] 2443; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm9 2444; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2445; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) 2446; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 2447; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] 2448; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero 2449; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] 2450; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] 2451; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2452; AVX512-NEXT: vpandn %ymm9, %ymm10, %ymm9 2453; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 2454; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) 2455; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero 2456; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30] 2457; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 2458; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 2459; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero 2460; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u] 2461; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 2462; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 2463; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) 2464; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] 2465; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2466; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2467; AVX512-NEXT: vmovdqa %ymm0, 128(%r9) 2468; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r9) 2469; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) 2470; AVX512-NEXT: vzeroupper 2471; AVX512-NEXT: retq 2472; 2473; AVX512-FCP-LABEL: store_i8_stride5_vf32: 2474; AVX512-FCP: # %bb.0: 2475; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 2476; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 2477; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 2478; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 2479; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 2480; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] 2481; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm6 2482; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero 2483; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 2484; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 2485; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7 2486; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] 2487; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9 2488; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero 2489; AVX512-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 2490; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] 2491; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 2492; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) 2493; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 2494; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 2495; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2496; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] 2497; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 2498; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2499; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] 2500; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 2501; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm7 ^ (ymm6 & (ymm4 ^ ymm7)) 2502; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] 2503; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] 2504; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] 2505; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 2506; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) 2507; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] 2508; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] 2509; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 2510; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero 2511; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] 2512; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 2513; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) 2514; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero 2515; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] 2516; AVX512-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5 2517; 
AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] 2518; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero 2519; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] 2520; AVX512-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 2521; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2522; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) 2523; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 2524; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] 2525; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,0,5,5,5,5,0,6] 2526; AVX512-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 2527; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2528; AVX512-FCP-NEXT: vpandn %ymm8, %ymm9, %ymm8 2529; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero 2530; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 2531; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) 2532; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero 2533; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] 2534; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 2535; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 2536; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero 2537; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u] 2538; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 2539; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2540; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2)) 2541; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7] 2542; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 2543; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 2544; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) 2545; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) 2546; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) 2547; AVX512-FCP-NEXT: vzeroupper 2548; AVX512-FCP-NEXT: retq 2549; 2550; AVX512DQ-LABEL: store_i8_stride5_vf32: 2551; AVX512DQ: # %bb.0: 2552; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 2553; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 2554; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 2555; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 2556; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 2557; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5 2558; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] 2559; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 2560; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero 2561; AVX512DQ-NEXT: vpor %xmm6, 
%xmm8, %xmm6 2562; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 2563; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 2564; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] 2565; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm10 2566; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero 2567; AVX512DQ-NEXT: vpor %xmm9, %xmm11, %xmm9 2568; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] 2569; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 2570; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6)) 2571; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 2572; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] 2573; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2574; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] 2575; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 2576; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2577; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 2578; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 2579; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm8 ^ (ymm7 & (ymm5 ^ ymm8)) 2580; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] 2581; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 2582; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] 2583; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm6 2584; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) 2585; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u] 2586; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u] 2587; AVX512DQ-NEXT: vpor %ymm5, %ymm8, %ymm5 2588; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero 2589; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19] 2590; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 2591; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5)) 2592; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero 2593; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u] 2594; AVX512DQ-NEXT: vpor %ymm5, %ymm9, %ymm5 2595; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] 2596; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2597; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] 2598; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm9 2599; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2600; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) 2601; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 2602; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] 2603; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero 2604; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] 2605; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] 2606; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2607; AVX512DQ-NEXT: vpandn %ymm9, %ymm10, %ymm9 2608; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 2609; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) 2610; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero 2611; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30] 2612; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 2613; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 2614; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero 2615; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u] 2616; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 2617; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 2618; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) 2619; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] 2620; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2621; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 2622; AVX512DQ-NEXT: vmovdqa %ymm0, 128(%r9) 2623; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r9) 2624; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) 2625; AVX512DQ-NEXT: vzeroupper 2626; AVX512DQ-NEXT: retq 2627; 2628; AVX512DQ-FCP-LABEL: store_i8_stride5_vf32: 2629; AVX512DQ-FCP: # %bb.0: 2630; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 2631; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 2632; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 2633; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 2634; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 2635; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] 2636; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm6 2637; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero 2638; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 2639; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 2640; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7 2641; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] 2642; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm9 2643; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero 2644; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 2645; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] 2646; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 2647; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) 2648; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 2649; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 2650; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2651; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] 2652; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 2653; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2654; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] 2655; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 2656; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm7 ^ (ymm6 & (ymm4 ^ ymm7)) 2657; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] 2658; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] 2659; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] 2660; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 2661; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) 2662; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] 2663; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] 2664; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 2665; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero 2666; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] 2667; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 2668; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) 2669; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero 2670; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] 2671; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5 2672; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] 2673; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero 2674; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] 2675; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 2676; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 2677; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) 2678; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 2679; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] 2680; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,0,5,5,5,5,0,6] 2681; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 2682; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 2683; AVX512DQ-FCP-NEXT: vpandn %ymm8, %ymm9, %ymm8 2684; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero 2685; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 2686; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) 2687; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero 2688; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] 2689; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 2690; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 2691; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero 2692; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u] 2693; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 2694; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2695; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2)) 2696; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7] 2697; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 2698; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) 2699; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) 2700; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) 2701; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) 2702; AVX512DQ-FCP-NEXT: vzeroupper 2703; AVX512DQ-FCP-NEXT: retq 2704; 2705; AVX512BW-LABEL: store_i8_stride5_vf32: 2706; AVX512BW: # %bb.0: 2707; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 2708; AVX512BW-NEXT: vmovdqa (%rsi), %ymm5 2709; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 2710; AVX512BW-NEXT: vmovdqa (%rcx), %ymm2 2711; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 2712; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 2713; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] 2714; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 2715; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero 2716; AVX512BW-NEXT: vpor %xmm6, %xmm8, %xmm6 2717; AVX512BW-NEXT: vmovdqa 
(%rdx), %xmm8 2718; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 2719; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 2720; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2721; AVX512BW-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 2722; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] 2723; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] 2724; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero 2725; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 2726; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] 2727; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2728; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3 2729; AVX512BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] 2730; AVX512BW-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C 2731; AVX512BW-NEXT: kmovq %rax, %k1 2732; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} 2733; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 2734; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] 2735; AVX512BW-NEXT: vpermd %zmm6, %zmm7, %zmm6 2736; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 2737; AVX512BW-NEXT: kmovq %rax, %k1 2738; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} 2739; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero 2740; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 2741; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm6 2742; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] 2743; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2744; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 2745; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 2746; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 2747; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] 2748; AVX512BW-NEXT: vpermd %ymm4, %ymm7, %ymm7 2749; AVX512BW-NEXT: movl $138547332, %eax # imm = 0x8421084 2750; AVX512BW-NEXT: kmovd %eax, %k1 2751; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 2752; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero 2753; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero 2754; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 2755; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2756; AVX512BW-NEXT: vinserti64x4 $1, 
%ymm8, %zmm7, %zmm7 2757; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 2758; AVX512BW-NEXT: kmovq %rax, %k1 2759; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 2760; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,3,3,3,0,4,4,4] 2761; AVX512BW-NEXT: vpermd %ymm0, %ymm6, %ymm6 2762; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] 2763; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] 2764; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 2765; AVX512BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 2766; AVX512BW-NEXT: kmovq %rax, %k1 2767; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 2768; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] 2769; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 2770; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] 2771; AVX512BW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 2772; AVX512BW-NEXT: kmovd %eax, %k1 2773; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm5 {%k1} 2774; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3] 2775; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] 2776; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 2777; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] 2778; AVX512BW-NEXT: movl $693250386, %eax # imm = 0x29522952 2779; AVX512BW-NEXT: kmovd %eax, %k1 2780; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} 2781; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,3,3] 2782; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C 2783; AVX512BW-NEXT: kmovd %eax, %k1 2784; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} 2785; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] 2786; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2787; AVX512BW-NEXT: movl $-2078209982, %eax # imm = 0x84210842 2788; AVX512BW-NEXT: kmovd %eax, %k1 2789; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 2790; AVX512BW-NEXT: vmovdqa %ymm1, 128(%r9) 2791; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r9) 2792; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) 2793; AVX512BW-NEXT: vzeroupper 2794; AVX512BW-NEXT: retq 2795; 2796; AVX512BW-FCP-LABEL: store_i8_stride5_vf32: 2797; AVX512BW-FCP: # %bb.0: 2798; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 2799; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 2800; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm0 2801; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 2802; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 2803; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] 2804; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm6 2805; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero 2806; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 2807; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 2808; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm8 2809; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 2810; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2811; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm9, %zmm5 2812; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] 2813; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] 2814; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero 2815; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 2816; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 2817; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2818; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 2819; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5] 2820; AVX512BW-FCP-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C 2821; AVX512BW-FCP-NEXT: kmovq %rax, %k1 2822; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} 2823; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] 2824; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] 2825; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 2826; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 2827; AVX512BW-FCP-NEXT: kmovq %rax, %k1 2828; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} 2829; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero 2830; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 2831; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 2832; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] 2833; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2834; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 2835; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 2836; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 2837; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] 2838; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7 2839; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 2840; AVX512BW-FCP-NEXT: kmovd %eax, %k1 2841; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 2842; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero 2843; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero 2844; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 2845; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2846; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 2847; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 2848; AVX512BW-FCP-NEXT: kmovq %rax, %k1 2849; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 2850; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] 2851; AVX512BW-FCP-NEXT: 
vpermd %zmm5, %zmm6, %zmm6 2852; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 2853; AVX512BW-FCP-NEXT: kmovq %rax, %k1 2854; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 2855; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30] 2856; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero 2857; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 2858; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 2859; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero 2860; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero 2861; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 2862; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2863; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C 2864; AVX512BW-FCP-NEXT: kmovd %eax, %k1 2865; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 2866; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] 2867; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 2868; AVX512BW-FCP-NEXT: movl $-2078209982, %eax # imm = 0x84210842 2869; AVX512BW-FCP-NEXT: kmovd %eax, %k1 2870; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 2871; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) 2872; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) 2873; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 2874; AVX512BW-FCP-NEXT: vzeroupper 2875; AVX512BW-FCP-NEXT: retq 2876; 2877; AVX512DQ-BW-LABEL: store_i8_stride5_vf32: 2878; AVX512DQ-BW: # %bb.0: 2879; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 2880; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm5 2881; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 2882; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm2 2883; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 2884; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 2885; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] 2886; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 2887; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero 2888; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm8, %xmm6 2889; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 2890; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 2891; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 2892; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2893; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 2894; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] 2895; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] 2896; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero 2897; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 2898; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] 2899; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2900; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3 2901; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] 2902; AVX512DQ-BW-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C 2903; AVX512DQ-BW-NEXT: kmovq %rax, %k1 2904; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} 2905; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 2906; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] 2907; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm7, %zmm6 2908; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 2909; AVX512DQ-BW-NEXT: kmovq %rax, %k1 2910; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} 2911; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero 2912; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 2913; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm6 2914; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] 2915; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 2916; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 2917; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 2918; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 2919; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] 2920; AVX512DQ-BW-NEXT: vpermd %ymm4, %ymm7, %ymm7 2921; AVX512DQ-BW-NEXT: movl $138547332, %eax # imm = 0x8421084 2922; AVX512DQ-BW-NEXT: kmovd %eax, %k1 2923; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 2924; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero 2925; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero 2926; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8 2927; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 2928; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 2929; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 2930; AVX512DQ-BW-NEXT: kmovq %rax, %k1 2931; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 2932; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,3,3,3,0,4,4,4] 2933; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm6, %ymm6 2934; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] 2935; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] 2936; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 2937; AVX512DQ-BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 2938; AVX512DQ-BW-NEXT: kmovq %rax, %k1 2939; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 2940; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] 2941; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 2942; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] 2943; AVX512DQ-BW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 2944; AVX512DQ-BW-NEXT: kmovd %eax, %k1 2945; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm5 {%k1} 2946; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3] 2947; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] 2948; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 2949; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] 2950; AVX512DQ-BW-NEXT: movl $693250386, %eax # imm = 0x29522952 2951; AVX512DQ-BW-NEXT: kmovd %eax, %k1 2952; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} 2953; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,3,3] 2954; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C 2955; AVX512DQ-BW-NEXT: kmovd %eax, %k1 2956; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} 2957; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] 2958; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 2959; AVX512DQ-BW-NEXT: movl $-2078209982, %eax # imm = 0x84210842 2960; AVX512DQ-BW-NEXT: kmovd %eax, %k1 2961; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} 2962; AVX512DQ-BW-NEXT: vmovdqa %ymm1, 128(%r9) 2963; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%r9) 2964; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) 2965; AVX512DQ-BW-NEXT: vzeroupper 2966; AVX512DQ-BW-NEXT: retq 2967; 2968; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf32: 2969; AVX512DQ-BW-FCP: # %bb.0: 2970; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 2971; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 2972; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm0 2973; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 2974; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 2975; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] 2976; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm6 2977; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero 2978; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 2979; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 2980; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm8 2981; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 2982; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 2983; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm9, %zmm5 2984; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] 2985; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] 2986; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero 2987; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 2988; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 2989; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 2990; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 
2991; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5] 2992; AVX512DQ-BW-FCP-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C 2993; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 2994; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} 2995; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] 2996; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] 2997; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 2998; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 2999; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 3000; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} 3001; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero 3002; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero 3003; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 3004; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] 3005; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero 3006; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 3007; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 3008; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 3009; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] 3010; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7 3011; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 3012; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 3013; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] 3014; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero 3015; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero 3016; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 3017; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 3018; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 3019; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 3020; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 3021; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 3022; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] 3023; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 3024; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 3025; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 3026; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} 3027; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30] 3028; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero 3029; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 3030; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 3031; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero 3032; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero 3033; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 3034; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 3035; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C 3036; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 3037; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 3038; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] 3039; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 3040; AVX512DQ-BW-FCP-NEXT: movl $-2078209982, %eax # imm = 0x84210842 3041; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 3042; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 3043; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) 3044; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) 3045; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r9) 3046; AVX512DQ-BW-FCP-NEXT: vzeroupper 3047; AVX512DQ-BW-FCP-NEXT: retq 3048 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 3049 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64 3050 %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64 3051 %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64 3052 %in.vec4 = load <32 x i8>, ptr %in.vecptr4, align 64 3053 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 3054 %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 3055 %3 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, 
i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 3056 %4 = shufflevector <32 x i8> %in.vec4, <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3057 %5 = shufflevector <128 x i8> %3, <128 x i8> %4, <160 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159> 3058 %interleaved.vec = shufflevector <160 x i8> %5, <160 x i8> poison, <160 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 1, i32 33, i32 65, i32 97, i32 129, i32 2, i32 34, i32 66, i32 98, i32 130, i32 3, i32 35, i32 67, i32 99, i32 131, i32 4, i32 36, i32 68, i32 100, i32 132, i32 5, i32 37, i32 69, i32 101, 
i32 133, i32 6, i32 38, i32 70, i32 102, i32 134, i32 7, i32 39, i32 71, i32 103, i32 135, i32 8, i32 40, i32 72, i32 104, i32 136, i32 9, i32 41, i32 73, i32 105, i32 137, i32 10, i32 42, i32 74, i32 106, i32 138, i32 11, i32 43, i32 75, i32 107, i32 139, i32 12, i32 44, i32 76, i32 108, i32 140, i32 13, i32 45, i32 77, i32 109, i32 141, i32 14, i32 46, i32 78, i32 110, i32 142, i32 15, i32 47, i32 79, i32 111, i32 143, i32 16, i32 48, i32 80, i32 112, i32 144, i32 17, i32 49, i32 81, i32 113, i32 145, i32 18, i32 50, i32 82, i32 114, i32 146, i32 19, i32 51, i32 83, i32 115, i32 147, i32 20, i32 52, i32 84, i32 116, i32 148, i32 21, i32 53, i32 85, i32 117, i32 149, i32 22, i32 54, i32 86, i32 118, i32 150, i32 23, i32 55, i32 87, i32 119, i32 151, i32 24, i32 56, i32 88, i32 120, i32 152, i32 25, i32 57, i32 89, i32 121, i32 153, i32 26, i32 58, i32 90, i32 122, i32 154, i32 27, i32 59, i32 91, i32 123, i32 155, i32 28, i32 60, i32 92, i32 124, i32 156, i32 29, i32 61, i32 93, i32 125, i32 157, i32 30, i32 62, i32 94, i32 126, i32 158, i32 31, i32 63, i32 95, i32 127, i32 159> 3059 store <160 x i8> %interleaved.vec, ptr %out.vec, align 64 3060 ret void 3061} 3062 3063define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { 3064; SSE-LABEL: store_i8_stride5_vf64: 3065; SSE: # %bb.0: 3066; SSE-NEXT: subq $504, %rsp # imm = 0x1F8 3067; SSE-NEXT: movdqa (%rdi), %xmm7 3068; SSE-NEXT: movdqa (%rsi), %xmm9 3069; SSE-NEXT: movdqa 16(%rsi), %xmm14 3070; SSE-NEXT: movdqa (%rdx), %xmm0 3071; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill 3072; SSE-NEXT: movdqa 16(%rdx), %xmm11 3073; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3074; SSE-NEXT: movdqa (%rcx), %xmm10 3075; SSE-NEXT: movdqa 16(%rcx), %xmm6 3076; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3077; SSE-NEXT: movdqa (%r8), %xmm13 3078; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 3079; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3080; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 3081; SSE-NEXT: pand %xmm12, %xmm0 3082; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] 3083; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3084; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] 3085; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 3086; SSE-NEXT: movdqa %xmm12, %xmm4 3087; SSE-NEXT: pandn %xmm1, %xmm4 3088; SSE-NEXT: por %xmm0, %xmm4 3089; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 3090; SSE-NEXT: pand %xmm8, %xmm4 3091; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] 3092; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 3093; SSE-NEXT: pand %xmm2, %xmm0 3094; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1] 3095; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 3096; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3097; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] 3098; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] 3099; SSE-NEXT: movdqa %xmm2, %xmm5 3100; SSE-NEXT: pandn %xmm1, %xmm5 3101; SSE-NEXT: por %xmm0, %xmm5 3102; SSE-NEXT: movdqa %xmm8, %xmm0 3103; SSE-NEXT: pandn %xmm5, %xmm0 3104; SSE-NEXT: por %xmm4, %xmm0 3105; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 3106; SSE-NEXT: pand %xmm15, %xmm0 3107; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm13[1,1,2,2] 3108; SSE-NEXT: movdqa %xmm15, %xmm3 3109; SSE-NEXT: pandn %xmm1, %xmm3 3110; SSE-NEXT: por %xmm0, %xmm3 3111; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3112; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] 3113; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3114; SSE-NEXT: pand %xmm12, %xmm0 3115; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] 3116; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3117; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] 3118; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] 3119; SSE-NEXT: movdqa %xmm12, %xmm5 3120; SSE-NEXT: pandn %xmm1, %xmm5 3121; SSE-NEXT: por %xmm0, %xmm5 3122; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] 3123; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 3124; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3125; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] 3126; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] 3127; SSE-NEXT: movdqa %xmm2, %xmm1 3128; SSE-NEXT: pandn %xmm0, %xmm1 3129; SSE-NEXT: movdqa 16(%rdi), %xmm0 3130; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3131; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 3132; SSE-NEXT: pand %xmm2, %xmm0 3133; SSE-NEXT: por %xmm0, %xmm1 3134; SSE-NEXT: movdqa %xmm8, %xmm0 3135; SSE-NEXT: pandn %xmm1, %xmm0 3136; SSE-NEXT: pand %xmm8, %xmm5 3137; SSE-NEXT: por %xmm5, %xmm0 3138; SSE-NEXT: movdqa 16(%r8), %xmm1 3139; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3140; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 3141; SSE-NEXT: movdqa %xmm15, %xmm3 3142; SSE-NEXT: pandn %xmm1, %xmm3 3143; SSE-NEXT: pand %xmm15, %xmm0 3144; SSE-NEXT: por %xmm0, %xmm3 3145; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3146; SSE-NEXT: movdqa 32(%rcx), %xmm0 3147; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3148; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 3149; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3150; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] 3151; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3152; SSE-NEXT: movdqa %xmm12, %xmm1 3153; SSE-NEXT: pandn %xmm0, %xmm1 3154; SSE-NEXT: movdqa 32(%rdx), %xmm0 3155; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3156; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 3157; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3158; SSE-NEXT: pand %xmm12, %xmm0 3159; SSE-NEXT: por %xmm0, %xmm1 3160; SSE-NEXT: movdqa 32(%rsi), %xmm11 3161; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,2,1] 3162; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 3163; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3164; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] 3165; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] 3166; SSE-NEXT: movdqa %xmm2, %xmm5 3167; SSE-NEXT: pandn %xmm0, %xmm5 3168; SSE-NEXT: movdqa 32(%rdi), %xmm0 3169; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3170; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 3171; SSE-NEXT: pand %xmm2, %xmm0 3172; SSE-NEXT: por %xmm0, %xmm5 3173; SSE-NEXT: movdqa %xmm8, %xmm0 3174; SSE-NEXT: pandn %xmm5, %xmm0 3175; SSE-NEXT: pand %xmm8, %xmm1 3176; SSE-NEXT: por %xmm1, %xmm0 3177; SSE-NEXT: movdqa 32(%r8), %xmm1 3178; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3179; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 
3180; SSE-NEXT: movdqa %xmm15, %xmm3 3181; SSE-NEXT: pandn %xmm1, %xmm3 3182; SSE-NEXT: pand %xmm15, %xmm0 3183; SSE-NEXT: por %xmm0, %xmm3 3184; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3185; SSE-NEXT: movdqa 48(%rcx), %xmm0 3186; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3187; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 3188; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3189; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] 3190; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3191; SSE-NEXT: movdqa %xmm12, %xmm1 3192; SSE-NEXT: pandn %xmm0, %xmm1 3193; SSE-NEXT: movdqa 48(%rdx), %xmm0 3194; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3195; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] 3196; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 3197; SSE-NEXT: pand %xmm12, %xmm0 3198; SSE-NEXT: por %xmm0, %xmm1 3199; SSE-NEXT: movdqa 48(%rsi), %xmm0 3200; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3201; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 3202; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 3203; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3204; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] 3205; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] 3206; SSE-NEXT: movdqa %xmm2, %xmm5 3207; SSE-NEXT: pandn %xmm0, %xmm5 3208; SSE-NEXT: movdqa 48(%rdi), %xmm0 3209; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3210; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] 3211; SSE-NEXT: pand %xmm2, %xmm0 3212; SSE-NEXT: por %xmm0, %xmm5 3213; SSE-NEXT: movdqa %xmm8, %xmm0 3214; SSE-NEXT: pandn %xmm5, %xmm0 3215; SSE-NEXT: pand %xmm8, %xmm1 3216; SSE-NEXT: por %xmm1, %xmm0 3217; SSE-NEXT: movdqa 48(%r8), %xmm1 3218; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3219; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 3220; SSE-NEXT: movdqa %xmm15, %xmm5 3221; SSE-NEXT: pandn %xmm1, %xmm5 3222; SSE-NEXT: pand %xmm15, %xmm0 3223; SSE-NEXT: por %xmm0, %xmm5 3224; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3225; SSE-NEXT: movdqa %xmm10, %xmm0 3226; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 3227; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3228; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 3229; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] 3230; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 3231; SSE-NEXT: movdqa %xmm15, %xmm1 3232; SSE-NEXT: pandn %xmm0, %xmm1 3233; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload 3234; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,2,2,3,4,5,6,7] 3235; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3236; SSE-NEXT: pand %xmm15, %xmm0 3237; SSE-NEXT: por %xmm0, %xmm1 3238; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 3239; SSE-NEXT: movdqa %xmm6, %xmm0 3240; SSE-NEXT: pandn %xmm1, %xmm0 3241; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3242; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] 3243; SSE-NEXT: movdqa %xmm2, %xmm5 3244; SSE-NEXT: pandn %xmm1, %xmm5 3245; SSE-NEXT: movdqa %xmm9, %xmm1 3246; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 3247; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3248; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 3249; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] 3250; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] 3251; SSE-NEXT: pand %xmm2, %xmm1 3252; SSE-NEXT: por %xmm5, %xmm1 3253; SSE-NEXT: pand %xmm6, %xmm1 3254; SSE-NEXT: por %xmm0, %xmm1 3255; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 3256; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3257; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] 3258; SSE-NEXT: movdqa %xmm4, %xmm0 3259; SSE-NEXT: pandn %xmm5, %xmm0 3260; SSE-NEXT: pand %xmm4, %xmm1 3261; SSE-NEXT: por %xmm1, %xmm0 3262; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3263; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3264; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3265; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1] 3266; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] 3267; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] 3268; SSE-NEXT: movdqa %xmm4, %xmm5 3269; SSE-NEXT: pandn %xmm1, %xmm5 3270; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] 3271; SSE-NEXT: pand %xmm4, %xmm1 3272; SSE-NEXT: por %xmm1, %xmm5 3273; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 3274; SSE-NEXT: movdqa %xmm8, %xmm7 3275; SSE-NEXT: pandn %xmm5, %xmm7 3276; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7] 3277; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] 3278; SSE-NEXT: movdqa %xmm2, %xmm9 3279; SSE-NEXT: pandn %xmm5, %xmm9 3280; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3281; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3282; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,1,2,1] 3283; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,2,3,4,5,6,7] 3284; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] 3285; SSE-NEXT: pand %xmm2, %xmm5 3286; SSE-NEXT: por %xmm9, %xmm5 3287; SSE-NEXT: pand %xmm8, %xmm5 3288; SSE-NEXT: por %xmm7, %xmm5 3289; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] 3290; SSE-NEXT: movdqa %xmm12, %xmm0 3291; SSE-NEXT: pandn %xmm7, %xmm0 3292; SSE-NEXT: pand %xmm12, %xmm5 3293; SSE-NEXT: por %xmm5, %xmm0 3294; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3295; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3296; SSE-NEXT: movdqa %xmm13, %xmm0 3297; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] 3298; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3299; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] 3300; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] 3301; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 3302; SSE-NEXT: movdqa %xmm15, %xmm7 3303; SSE-NEXT: pandn %xmm5, %xmm7 3304; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3305; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] 3306; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] 3307; SSE-NEXT: pand %xmm15, %xmm5 3308; SSE-NEXT: por %xmm5, %xmm7 3309; SSE-NEXT: movdqa %xmm6, %xmm5 3310; SSE-NEXT: pandn %xmm7, %xmm5 3311; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3312; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 3313; SSE-NEXT: movdqa %xmm2, %xmm9 3314; SSE-NEXT: pandn %xmm7, %xmm9 3315; 
SSE-NEXT: movdqa %xmm14, %xmm1 3316; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 3317; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3318; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] 3319; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] 3320; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] 3321; SSE-NEXT: pand %xmm2, %xmm7 3322; SSE-NEXT: por %xmm9, %xmm7 3323; SSE-NEXT: pand %xmm6, %xmm7 3324; SSE-NEXT: por %xmm5, %xmm7 3325; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3326; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] 3327; SSE-NEXT: movdqa %xmm4, %xmm3 3328; SSE-NEXT: pandn %xmm5, %xmm3 3329; SSE-NEXT: pand %xmm4, %xmm7 3330; SSE-NEXT: por %xmm7, %xmm3 3331; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3332; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3333; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3334; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,2,1] 3335; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 3336; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] 3337; SSE-NEXT: movdqa %xmm4, %xmm7 3338; SSE-NEXT: pandn %xmm5, %xmm7 3339; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] 3340; SSE-NEXT: pand %xmm4, %xmm5 3341; SSE-NEXT: por %xmm5, %xmm7 3342; SSE-NEXT: movdqa %xmm8, %xmm5 3343; SSE-NEXT: pandn %xmm7, %xmm5 3344; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] 3345; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] 3346; SSE-NEXT: movdqa %xmm2, %xmm9 3347; SSE-NEXT: pandn %xmm7, %xmm9 3348; SSE-NEXT: movdqa %xmm13, %xmm0 3349; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 3350; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3351; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,1] 3352; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] 3353; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] 3354; SSE-NEXT: pand %xmm2, %xmm7 3355; SSE-NEXT: por %xmm9, %xmm7 3356; SSE-NEXT: pand %xmm8, %xmm7 3357; SSE-NEXT: por %xmm5, %xmm7 3358; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] 3359; SSE-NEXT: movdqa %xmm12, %xmm0 3360; SSE-NEXT: pandn %xmm5, %xmm0 3361; SSE-NEXT: pand %xmm12, %xmm7 3362; SSE-NEXT: por %xmm7, %xmm0 3363; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3364; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3365; SSE-NEXT: movdqa %xmm3, %xmm0 3366; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 3367; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3368; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] 3369; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] 3370; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 3371; SSE-NEXT: movdqa %xmm15, %xmm7 3372; SSE-NEXT: pandn %xmm5, %xmm7 3373; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3374; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] 3375; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] 3376; SSE-NEXT: pand %xmm15, %xmm5 3377; SSE-NEXT: por %xmm5, %xmm7 3378; SSE-NEXT: movdqa %xmm6, %xmm5 3379; SSE-NEXT: pandn %xmm7, %xmm5 3380; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3381; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] 3382; SSE-NEXT: movdqa %xmm2, %xmm9 3383; SSE-NEXT: pandn %xmm7, %xmm9 3384; SSE-NEXT: movdqa %xmm11, %xmm14 3385; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] 3386; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,1,2,3] 3387; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] 3388; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] 3389; SSE-NEXT: pand %xmm2, %xmm7 3390; SSE-NEXT: por %xmm9, %xmm7 3391; SSE-NEXT: pand %xmm6, %xmm7 3392; SSE-NEXT: por %xmm5, %xmm7 3393; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3394; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] 3395; SSE-NEXT: movdqa %xmm4, %xmm1 3396; SSE-NEXT: pandn %xmm5, %xmm1 3397; SSE-NEXT: pand %xmm4, %xmm7 3398; SSE-NEXT: por %xmm7, %xmm1 3399; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3400; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3401; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3402; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] 3403; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 3404; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] 3405; SSE-NEXT: movdqa %xmm4, %xmm7 3406; SSE-NEXT: pandn %xmm5, %xmm7 3407; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] 3408; SSE-NEXT: pand %xmm4, %xmm5 3409; SSE-NEXT: por %xmm5, %xmm7 3410; SSE-NEXT: movdqa %xmm8, %xmm5 3411; SSE-NEXT: pandn %xmm7, %xmm5 3412; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] 3413; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] 3414; SSE-NEXT: movdqa %xmm2, %xmm9 3415; SSE-NEXT: pandn %xmm7, %xmm9 3416; SSE-NEXT: movdqa %xmm3, %xmm1 3417; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 3418; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3419; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] 3420; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] 3421; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] 3422; SSE-NEXT: pand %xmm2, %xmm7 3423; SSE-NEXT: por %xmm9, %xmm7 3424; SSE-NEXT: pand %xmm8, %xmm7 3425; SSE-NEXT: por %xmm5, %xmm7 3426; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] 3427; SSE-NEXT: movdqa %xmm12, %xmm0 3428; SSE-NEXT: pandn %xmm5, %xmm0 3429; SSE-NEXT: pand %xmm12, %xmm7 3430; SSE-NEXT: por %xmm7, %xmm0 3431; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3432; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3433; SSE-NEXT: movdqa %xmm0, %xmm1 3434; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3435; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3436; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] 3437; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] 3438; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] 3439; SSE-NEXT: movdqa %xmm15, %xmm7 3440; SSE-NEXT: pandn %xmm5, %xmm7 3441; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3442; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] 3443; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] 3444; SSE-NEXT: pand %xmm15, %xmm5 3445; SSE-NEXT: por %xmm5, 
%xmm7 3446; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3447; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] 3448; SSE-NEXT: movdqa %xmm2, %xmm9 3449; SSE-NEXT: pandn %xmm5, %xmm9 3450; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3451; SSE-NEXT: movdqa %xmm3, %xmm1 3452; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 3453; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3454; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] 3455; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7] 3456; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] 3457; SSE-NEXT: pand %xmm2, %xmm5 3458; SSE-NEXT: por %xmm9, %xmm5 3459; SSE-NEXT: pand %xmm6, %xmm5 3460; SSE-NEXT: pandn %xmm7, %xmm6 3461; SSE-NEXT: por %xmm5, %xmm6 3462; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3463; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] 3464; SSE-NEXT: movdqa %xmm4, %xmm1 3465; SSE-NEXT: pandn %xmm5, %xmm1 3466; SSE-NEXT: pand %xmm4, %xmm6 3467; SSE-NEXT: por %xmm6, %xmm1 3468; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3469; SSE-NEXT: movdqa %xmm3, %xmm1 3470; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 3471; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3472; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,1] 3473; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] 3474; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] 3475; SSE-NEXT: movdqa %xmm4, %xmm6 3476; SSE-NEXT: pandn %xmm5, %xmm6 3477; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] 3478; SSE-NEXT: pand %xmm4, %xmm5 3479; SSE-NEXT: por %xmm5, %xmm6 3480; SSE-NEXT: movdqa %xmm8, %xmm5 3481; SSE-NEXT: pandn %xmm6, %xmm5 3482; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,5,6,6,7] 3483; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 3484; SSE-NEXT: movdqa %xmm2, %xmm7 3485; SSE-NEXT: pandn %xmm6, %xmm7 3486; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3487; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3488; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] 3489; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,2,3,4,5,6,7] 3490; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,4] 3491; SSE-NEXT: pand %xmm2, %xmm6 3492; SSE-NEXT: por %xmm7, %xmm6 3493; SSE-NEXT: pand %xmm8, %xmm6 3494; SSE-NEXT: por %xmm5, %xmm6 3495; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,2,2] 3496; SSE-NEXT: movdqa %xmm12, %xmm0 3497; SSE-NEXT: pandn %xmm5, %xmm0 3498; SSE-NEXT: pand %xmm12, %xmm6 3499; SSE-NEXT: por %xmm6, %xmm0 3500; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3501; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 3502; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] 3503; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] 3504; SSE-NEXT: movdqa %xmm4, %xmm6 3505; SSE-NEXT: pandn %xmm5, %xmm6 3506; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload 3507; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,0,2,3,4,5,6,7] 3508; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] 3509; SSE-NEXT: pand %xmm4, %xmm5 3510; SSE-NEXT: por %xmm5, %xmm6 3511; SSE-NEXT: movdqa %xmm8, %xmm5 3512; SSE-NEXT: pandn %xmm6, %xmm5 3513; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 3514; SSE-NEXT: # 
xmm6 = mem[0,1,2,2,4,5,6,7] 3515; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] 3516; SSE-NEXT: movdqa %xmm12, %xmm7 3517; SSE-NEXT: pandn %xmm6, %xmm7 3518; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3519; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,0,0] 3520; SSE-NEXT: pand %xmm12, %xmm6 3521; SSE-NEXT: por %xmm6, %xmm7 3522; SSE-NEXT: pand %xmm8, %xmm7 3523; SSE-NEXT: por %xmm5, %xmm7 3524; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3525; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] 3526; SSE-NEXT: movdqa %xmm2, %xmm1 3527; SSE-NEXT: pandn %xmm5, %xmm1 3528; SSE-NEXT: pand %xmm2, %xmm7 3529; SSE-NEXT: por %xmm7, %xmm1 3530; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3531; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 3532; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] 3533; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] 3534; SSE-NEXT: movdqa %xmm2, %xmm6 3535; SSE-NEXT: pandn %xmm5, %xmm6 3536; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,6,6,7] 3537; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] 3538; SSE-NEXT: pand %xmm2, %xmm5 3539; SSE-NEXT: por %xmm5, %xmm6 3540; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 3541; SSE-NEXT: movdqa %xmm1, %xmm5 3542; SSE-NEXT: pandn %xmm6, %xmm5 3543; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 3544; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] 3545; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] 3546; SSE-NEXT: movdqa %xmm15, %xmm7 3547; SSE-NEXT: pandn %xmm6, %xmm7 3548; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] 3549; SSE-NEXT: pand %xmm15, %xmm6 3550; SSE-NEXT: por %xmm6, %xmm7 3551; SSE-NEXT: pand %xmm1, %xmm7 3552; SSE-NEXT: por %xmm5, %xmm7 3553; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] 3554; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 3555; SSE-NEXT: movdqa %xmm9, %xmm13 3556; SSE-NEXT: pandn %xmm5, %xmm13 3557; SSE-NEXT: pand %xmm9, %xmm7 3558; SSE-NEXT: por %xmm7, %xmm13 3559; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 3560; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] 3561; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] 3562; SSE-NEXT: movdqa %xmm4, %xmm6 3563; SSE-NEXT: pandn %xmm5, %xmm6 3564; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3565; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,3,4,5,6,7] 3566; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] 3567; SSE-NEXT: pand %xmm4, %xmm5 3568; SSE-NEXT: por %xmm5, %xmm6 3569; SSE-NEXT: movdqa %xmm8, %xmm5 3570; SSE-NEXT: pandn %xmm6, %xmm5 3571; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 3572; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] 3573; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] 3574; SSE-NEXT: movdqa %xmm12, %xmm3 3575; SSE-NEXT: pandn %xmm6, %xmm3 3576; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3577; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] 3578; SSE-NEXT: pand %xmm12, %xmm6 3579; SSE-NEXT: por %xmm6, %xmm3 3580; SSE-NEXT: pand %xmm8, %xmm3 3581; SSE-NEXT: por %xmm5, %xmm3 3582; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3583; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] 3584; SSE-NEXT: movdqa %xmm2, %xmm10 3585; SSE-NEXT: pandn %xmm5, %xmm10 3586; SSE-NEXT: pand %xmm2, %xmm3 3587; SSE-NEXT: por %xmm3, %xmm10 3588; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3589; SSE-NEXT: # 
xmm3 = mem[0,1,2,3,5,7,6,7] 3590; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] 3591; SSE-NEXT: movdqa %xmm2, %xmm5 3592; SSE-NEXT: pandn %xmm3, %xmm5 3593; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,6,6,7] 3594; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] 3595; SSE-NEXT: pand %xmm2, %xmm3 3596; SSE-NEXT: por %xmm3, %xmm5 3597; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 3598; SSE-NEXT: movdqa %xmm7, %xmm3 3599; SSE-NEXT: pandn %xmm5, %xmm3 3600; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 3601; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] 3602; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] 3603; SSE-NEXT: movdqa %xmm15, %xmm6 3604; SSE-NEXT: pandn %xmm5, %xmm6 3605; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] 3606; SSE-NEXT: pand %xmm15, %xmm5 3607; SSE-NEXT: por %xmm5, %xmm6 3608; SSE-NEXT: pand %xmm7, %xmm6 3609; SSE-NEXT: por %xmm3, %xmm6 3610; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] 3611; SSE-NEXT: movdqa %xmm9, %xmm7 3612; SSE-NEXT: pandn %xmm3, %xmm7 3613; SSE-NEXT: pand %xmm9, %xmm6 3614; SSE-NEXT: por %xmm6, %xmm7 3615; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3616; SSE-NEXT: # xmm3 = mem[1,0,2,3,4,5,6,7] 3617; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] 3618; SSE-NEXT: movdqa %xmm4, %xmm6 3619; SSE-NEXT: pandn %xmm3, %xmm6 3620; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3621; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,0,2,3,4,5,6,7] 3622; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 3623; SSE-NEXT: pand %xmm4, %xmm3 3624; SSE-NEXT: por %xmm3, %xmm6 3625; SSE-NEXT: movdqa %xmm8, %xmm3 3626; SSE-NEXT: pandn %xmm6, %xmm3 3627; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm14[0,1,2,2,4,5,6,7] 3628; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] 3629; SSE-NEXT: movdqa %xmm12, %xmm11 3630; SSE-NEXT: pandn %xmm6, %xmm11 3631; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3632; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] 3633; SSE-NEXT: pand %xmm12, %xmm6 3634; SSE-NEXT: por %xmm6, %xmm11 3635; SSE-NEXT: pand %xmm8, %xmm11 3636; SSE-NEXT: por %xmm3, %xmm11 3637; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3638; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] 3639; SSE-NEXT: movdqa %xmm2, %xmm14 3640; SSE-NEXT: pandn %xmm3, %xmm14 3641; SSE-NEXT: pand %xmm2, %xmm11 3642; SSE-NEXT: por %xmm11, %xmm14 3643; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3644; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] 3645; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] 3646; SSE-NEXT: movdqa %xmm2, %xmm6 3647; SSE-NEXT: pandn %xmm3, %xmm6 3648; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,6,6,7] 3649; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] 3650; SSE-NEXT: pand %xmm2, %xmm3 3651; SSE-NEXT: por %xmm3, %xmm6 3652; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 3653; SSE-NEXT: movdqa %xmm5, %xmm3 3654; SSE-NEXT: pandn %xmm6, %xmm3 3655; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 3656; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] 3657; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] 3658; SSE-NEXT: movdqa %xmm15, %xmm11 3659; SSE-NEXT: pandn %xmm6, %xmm11 3660; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] 3661; SSE-NEXT: pand %xmm15, %xmm6 3662; SSE-NEXT: por %xmm6, %xmm11 3663; SSE-NEXT: pand %xmm5, %xmm11 3664; SSE-NEXT: por %xmm3, %xmm11 3665; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] 3666; 
SSE-NEXT: movdqa %xmm9, %xmm6 3667; SSE-NEXT: pandn %xmm3, %xmm6 3668; SSE-NEXT: pand %xmm9, %xmm11 3669; SSE-NEXT: por %xmm11, %xmm6 3670; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3671; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,3,4,5,6,7] 3672; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 3673; SSE-NEXT: pand %xmm4, %xmm3 3674; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3675; SSE-NEXT: # xmm11 = mem[1,0,2,3,4,5,6,7] 3676; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,1] 3677; SSE-NEXT: pandn %xmm11, %xmm4 3678; SSE-NEXT: por %xmm3, %xmm4 3679; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3680; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,0,0] 3681; SSE-NEXT: pand %xmm12, %xmm3 3682; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3683; SSE-NEXT: # xmm11 = mem[0,1,2,2,4,5,6,7] 3684; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3] 3685; SSE-NEXT: pandn %xmm11, %xmm12 3686; SSE-NEXT: por %xmm3, %xmm12 3687; SSE-NEXT: pand %xmm8, %xmm12 3688; SSE-NEXT: pandn %xmm4, %xmm8 3689; SSE-NEXT: por %xmm12, %xmm8 3690; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3691; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] 3692; SSE-NEXT: movdqa %xmm2, %xmm0 3693; SSE-NEXT: pandn %xmm3, %xmm0 3694; SSE-NEXT: pand %xmm2, %xmm8 3695; SSE-NEXT: por %xmm8, %xmm0 3696; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] 3697; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 3698; SSE-NEXT: pand %xmm2, %xmm1 3699; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3700; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] 3701; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] 3702; SSE-NEXT: pandn %xmm3, %xmm2 3703; SSE-NEXT: por %xmm1, %xmm2 3704; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] 3705; SSE-NEXT: pand %xmm15, %xmm1 3706; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 3707; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] 3708; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] 3709; SSE-NEXT: pandn %xmm3, %xmm15 3710; SSE-NEXT: por %xmm1, %xmm15 3711; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] 3712; SSE-NEXT: pand %xmm1, %xmm15 3713; SSE-NEXT: pandn %xmm2, %xmm1 3714; SSE-NEXT: por %xmm15, %xmm1 3715; SSE-NEXT: pand %xmm9, %xmm1 3716; SSE-NEXT: movdqa %xmm1, %xmm2 3717; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] 3718; SSE-NEXT: pandn %xmm1, %xmm9 3719; SSE-NEXT: por %xmm2, %xmm9 3720; SSE-NEXT: movdqa %xmm9, 304(%r9) 3721; SSE-NEXT: movdqa %xmm0, 240(%r9) 3722; SSE-NEXT: movdqa %xmm6, 224(%r9) 3723; SSE-NEXT: movdqa %xmm14, 160(%r9) 3724; SSE-NEXT: movdqa %xmm7, 144(%r9) 3725; SSE-NEXT: movdqa %xmm10, 80(%r9) 3726; SSE-NEXT: movdqa %xmm13, 64(%r9) 3727; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3728; SSE-NEXT: movaps %xmm0, (%r9) 3729; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3730; SSE-NEXT: movaps %xmm0, 288(%r9) 3731; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3732; SSE-NEXT: movaps %xmm0, 256(%r9) 3733; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3734; SSE-NEXT: movaps %xmm0, 208(%r9) 3735; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3736; SSE-NEXT: movaps %xmm0, 176(%r9) 3737; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3738; SSE-NEXT: movaps %xmm0, 128(%r9) 3739; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3740; 
SSE-NEXT: movaps %xmm0, 96(%r9) 3741; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3742; SSE-NEXT: movaps %xmm0, 48(%r9) 3743; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3744; SSE-NEXT: movaps %xmm0, 16(%r9) 3745; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3746; SSE-NEXT: movaps %xmm0, 272(%r9) 3747; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3748; SSE-NEXT: movaps %xmm0, 192(%r9) 3749; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3750; SSE-NEXT: movaps %xmm0, 112(%r9) 3751; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3752; SSE-NEXT: movaps %xmm0, 32(%r9) 3753; SSE-NEXT: addq $504, %rsp # imm = 0x1F8 3754; SSE-NEXT: retq 3755; 3756; AVX-LABEL: store_i8_stride5_vf64: 3757; AVX: # %bb.0: 3758; AVX-NEXT: subq $104, %rsp 3759; AVX-NEXT: vmovdqa 48(%rcx), %xmm0 3760; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] 3761; AVX-NEXT: # xmm14 = mem[0,0] 3762; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm2 3763; AVX-NEXT: vmovdqa 48(%rdx), %xmm1 3764; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] 3765; AVX-NEXT: vpor %xmm2, %xmm3, %xmm3 3766; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3767; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] 3768; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 3769; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 3770; AVX-NEXT: vandnps %ymm3, %ymm2, %ymm4 3771; AVX-NEXT: vmovdqa 48(%rsi), %xmm3 3772; AVX-NEXT: vmovddup {{.*#+}} xmm10 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] 3773; AVX-NEXT: # xmm10 = mem[0,0] 3774; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm6 3775; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 3776; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] 3777; AVX-NEXT: # xmm12 = mem[0,0] 3778; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm7 3779; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 3780; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 3781; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] 3782; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 3783; AVX-NEXT: vandps %ymm2, %ymm6, %ymm6 3784; AVX-NEXT: vorps %ymm4, %ymm6, %ymm4 3785; AVX-NEXT: vextractf128 $1, %ymm4, %xmm6 3786; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,xmm6[3,4,5,6],zero,xmm6[8,9,10,11],zero,xmm6[13,14,15] 3787; AVX-NEXT: vmovdqa 48(%r8), %xmm7 3788; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm7[6],zero,zero,zero,zero,xmm7[7],zero,zero,zero,zero,xmm7[8],zero,zero,zero 3789; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 3790; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3791; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] 3792; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm7[3],zero,zero,zero,zero,xmm7[4],zero,zero,zero,zero,xmm7[5],zero,zero 3793; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4 3794; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3795; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 3796; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] 3797; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] 3798; AVX-NEXT: # xmm8 = mem[0,0] 3799; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm4 3800; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 3801; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 3802; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 3803; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] 3804; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] 3805; AVX-NEXT: # xmm15 = mem[0,0] 3806; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 3807; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3808; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 3809; AVX-NEXT: vandnps %ymm4, %ymm11, %ymm1 3810; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 3811; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 3812; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 3813; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero 3814; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15] 3815; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1 3816; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3817; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] 3818; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm0 3819; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] 3820; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm1 3821; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 3822; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3823; AVX-NEXT: vmovdqa 32(%rsi), %xmm0 3824; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 3825; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm1 3826; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm6 3827; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 3828; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 3829; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm6 3830; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm6 3831; AVX-NEXT: vmovdqa 32(%rcx), %xmm1 3832; AVX-NEXT: vmovdqa 32(%rdx), %xmm8 3833; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm10 3834; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] 3835; AVX-NEXT: vpor %xmm10, %xmm14, %xmm10 3836; AVX-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 3837; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14 3838; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm14 3839; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 3840; AVX-NEXT: vandnps %ymm6, %ymm10, %ymm6 3841; AVX-NEXT: vandps %ymm10, %ymm14, %ymm14 3842; AVX-NEXT: vorps %ymm6, %ymm14, %ymm14 3843; AVX-NEXT: vextractf128 $1, %ymm14, %xmm6 3844; AVX-NEXT: 
vpshufb %xmm13, %xmm6, %xmm15 3845; AVX-NEXT: vmovdqa 32(%r8), %xmm6 3846; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm13 3847; AVX-NEXT: vpor %xmm13, %xmm15, %xmm2 3848; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3849; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15] 3850; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[6],zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,xmm6[8],zero,zero,zero 3851; AVX-NEXT: vpor %xmm14, %xmm13, %xmm2 3852; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 3853; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] 3854; AVX-NEXT: # xmm14 = mem[0,0] 3855; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm2 3856; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] 3857; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] 3858; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 3859; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 3860; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] 3861; AVX-NEXT: # xmm13 = mem[0,0] 3862; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm3 3863; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] 3864; AVX-NEXT: # xmm15 = mem[0,0] 3865; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm5 3866; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 3867; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2 3868; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3 3869; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 3870; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 3871; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] 3872; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 3873; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] 3874; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm5 3875; AVX-NEXT: vmovdqa %xmm10, %xmm7 3876; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 3877; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3878; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] 3879; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 3880; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] 3881; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm3 3882; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 3883; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3884; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] 3885; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] 3886; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm1 3887; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3888; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 3889; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 3890; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] 3891; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm0 3892; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3893; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1 3894; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 
3895; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 3896; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 3897; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] 3898; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3],zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,xmm6[5],zero,zero 3899; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 3900; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3901; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm0 3902; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm1 3903; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 3904; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3905; AVX-NEXT: vmovdqa 16(%rsi), %xmm8 3906; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 3907; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] 3908; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm1 3909; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] 3910; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3911; AVX-NEXT: vmovdqa 16(%rcx), %xmm1 3912; AVX-NEXT: vmovdqa 16(%rdx), %xmm2 3913; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 3914; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] 3915; AVX-NEXT: # xmm12 = mem[0,0] 3916; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 3917; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 3918; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] 3919; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 3920; AVX-NEXT: vandnps %ymm0, %ymm11, %ymm0 3921; AVX-NEXT: vandps %ymm3, %ymm11, %ymm3 3922; AVX-NEXT: vorps %ymm0, %ymm3, %ymm3 3923; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0 3924; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm4 3925; AVX-NEXT: vmovdqa %xmm5, %xmm11 3926; AVX-NEXT: vmovdqa 16(%r8), %xmm0 3927; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm7 3928; AVX-NEXT: vpor %xmm7, %xmm4, %xmm4 3929; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3930; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5],zero,xmm3[7,8,9,10],zero,xmm3[12,13,14,15] 3931; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero 3932; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 3933; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3934; AVX-NEXT: vmovdqa (%rcx), %xmm9 3935; AVX-NEXT: vmovdqa (%rdx), %xmm7 3936; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] 3937; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 3938; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 3939; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm5 3940; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm12 3941; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] 3942; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm13 3943; AVX-NEXT: vmovdqa (%rsi), %xmm5 3944; AVX-NEXT: vmovdqa (%rdi), %xmm3 3945; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 3946; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm14 3947; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 3948; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 3949; AVX-NEXT: vandnps %ymm12, %ymm14, %ymm12 3950; AVX-NEXT: vandps %ymm14, %ymm13, %ymm13 3951; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12 3952; AVX-NEXT: vextractf128 $1, %ymm12, %xmm13 3953; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3],zero,xmm13[5,6,7,8],zero,xmm13[10,11,12,13],zero,xmm13[15] 3954; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero 3955; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 3956; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3957; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12 3958; AVX-NEXT: vmovdqa (%r8), %xmm13 3959; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[12],zero,zero,zero,zero,xmm13[13],zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,xmm13[15] 3960; AVX-NEXT: vpor %xmm15, %xmm12, %xmm11 3961; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3962; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] 3963; AVX-NEXT: # xmm11 = mem[0,0] 3964; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 3965; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] 3966; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 3967; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] 3968; AVX-NEXT: # xmm11 = mem[0,0] 3969; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm2 3970; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 3971; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] 3972; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] 3973; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2 3974; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 3975; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] 3976; AVX-NEXT: # xmm6 = mem[0,0] 3977; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4 3978; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 3979; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 3980; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 3981; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2 3982; AVX-NEXT: vorps %ymm1, %ymm2, %ymm2 3983; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 3984; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] 3985; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm1 3986; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] 3987; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm4 3988; AVX-NEXT: vpor %xmm4, %xmm1, %xmm12 3989; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] 3990; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 3991; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] 3992; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 3993; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 3994; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 3995; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm4 3996; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] 3997; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 3998; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 3999; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4 4000; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 4001; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] 4002; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 4003; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 4004; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 4005; AVX-NEXT: vandps %ymm6, %ymm4, %ymm4 4006; AVX-NEXT: vorps %ymm2, %ymm4, %ymm4 4007; AVX-NEXT: vextractf128 $1, %ymm4, %xmm2 4008; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 4009; AVX-NEXT: vpshufb %xmm8, %xmm13, %xmm6 4010; AVX-NEXT: vpor %xmm6, %xmm2, %xmm2 4011; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] 4012; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero,xmm13[2],zero 4013; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4 4014; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] 4015; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] 4016; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 4017; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] 4018; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 4019; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero 4020; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9] 4021; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 4022; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] 4023; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] 4024; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 4025; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] 4026; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 4027; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5 4028; AVX-NEXT: vorps %ymm3, %ymm5, %ymm3 4029; AVX-NEXT: vextractf128 $1, %ymm3, %xmm5 4030; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] 4031; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm13[9],zero,zero,zero,zero,xmm13[10],zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero 4032; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 4033; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm3 4034; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm6 4035; AVX-NEXT: vpor %xmm6, %xmm3, %xmm3 4036; AVX-NEXT: vmovdqa %xmm3, 32(%r9) 4037; AVX-NEXT: vmovdqa %xmm5, 48(%r9) 4038; AVX-NEXT: vmovdqa %xmm4, (%r9) 4039; AVX-NEXT: vmovdqa %xmm2, 16(%r9) 4040; AVX-NEXT: vmovdqa %xmm0, 96(%r9) 4041; AVX-NEXT: vmovdqa %xmm12, 112(%r9) 4042; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4043; 
AVX-NEXT: vmovaps %xmm0, 64(%r9) 4044; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4045; AVX-NEXT: vmovaps %xmm0, 80(%r9) 4046; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4047; AVX-NEXT: vmovaps %xmm0, 128(%r9) 4048; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4049; AVX-NEXT: vmovaps %xmm0, 144(%r9) 4050; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4051; AVX-NEXT: vmovaps %xmm0, 160(%r9) 4052; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4053; AVX-NEXT: vmovaps %xmm0, 176(%r9) 4054; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4055; AVX-NEXT: vmovaps %xmm0, 224(%r9) 4056; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4057; AVX-NEXT: vmovaps %xmm0, 240(%r9) 4058; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4059; AVX-NEXT: vmovaps %xmm0, 192(%r9) 4060; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4061; AVX-NEXT: vmovaps %xmm0, 208(%r9) 4062; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4063; AVX-NEXT: vmovaps %xmm0, 288(%r9) 4064; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4065; AVX-NEXT: vmovaps %xmm0, 304(%r9) 4066; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4067; AVX-NEXT: vmovaps %xmm0, 256(%r9) 4068; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4069; AVX-NEXT: vmovaps %xmm0, 272(%r9) 4070; AVX-NEXT: addq $104, %rsp 4071; AVX-NEXT: vzeroupper 4072; AVX-NEXT: retq 4073; 4074; AVX2-LABEL: store_i8_stride5_vf64: 4075; AVX2: # %bb.0: 4076; AVX2-NEXT: subq $248, %rsp 4077; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 4078; AVX2-NEXT: vmovdqa (%rcx), %xmm1 4079; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4080; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7 4081; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4082; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 4083; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 4084; AVX2-NEXT: vmovdqa (%rdx), %xmm3 4085; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4086; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10 4087; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4088; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 4089; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 4090; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 4091; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] 4092; AVX2-NEXT: vmovdqa (%rdi), %xmm5 4093; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4094; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 4095; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 4096; AVX2-NEXT: vmovdqa (%rsi), %xmm6 4097; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 4098; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm9 4099; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 4100; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] 4101; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 4102; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm1 4103; AVX2-NEXT: vmovdqa (%r8), %xmm5 4104; AVX2-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill 4105; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] 4106; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] 4107; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 4108; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 4109; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4110; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 4111; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4112; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 4113; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm1 4114; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 4115; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 4116; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4117; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm1 4118; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 4119; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 4120; AVX2-NEXT: vmovdqa 32(%rsi), %ymm11 4121; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 4122; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] 4123; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 4124; AVX2-NEXT: vmovdqa 32(%r8), %xmm1 4125; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4126; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 4127; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] 4128; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 4129; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4130; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] 4131; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm1 4132; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4133; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] 4134; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 4135; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm3 4136; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 4137; AVX2-NEXT: vmovdqa 32(%rdx), %ymm12 4138; AVX2-NEXT: vmovdqa 32(%rcx), %ymm14 4139; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 4140; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 4141; AVX2-NEXT: vpshufb %ymm3, %ymm14, %ymm8 4142; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] 4143; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm10 4144; AVX2-NEXT: vpor %ymm8, %ymm10, %ymm8 4145; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 4146; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 4147; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] 4148; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm2 4149; AVX2-NEXT: vmovdqa (%rdi), %ymm9 4150; AVX2-NEXT: vpshufb %ymm15, %ymm9, %ymm1 4151; AVX2-NEXT: vmovdqa (%rsi), %ymm15 4152; AVX2-NEXT: vpshufb %ymm4, %ymm15, %ymm4 4153; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm4 4154; AVX2-NEXT: vmovdqa (%rcx), %ymm7 4155; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm0 4156; AVX2-NEXT: vmovdqa (%rdx), %ymm3 4157; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm5 4158; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 4159; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] 4160; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 4161; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 4162; AVX2-NEXT: vmovdqa 32(%r8), %ymm10 4163; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,2,1,1,4,6,5,5] 4164; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] 4165; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 4166; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm1 4167; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4168; AVX2-NEXT: vmovdqa (%r8), %ymm8 4169; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,2,1,1,4,6,5,5] 4170; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] 4171; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 4172; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4173; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,0,4,4,4,4] 4174; AVX2-NEXT: vpermd %ymm13, %ymm2, %ymm4 4175; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 4176; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0 4177; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] 4178; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 4179; AVX2-NEXT: vpermd %ymm9, %ymm2, %ymm2 4180; AVX2-NEXT: vpshufb %ymm5, %ymm15, %ymm4 4181; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1 4182; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 4183; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm4 4184; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] 4185; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm13 4186; AVX2-NEXT: vpor %ymm4, %ymm13, %ymm4 4187; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] 4188; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm0 4189; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm2 4190; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm4 4191; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 4192; AVX2-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 4193; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] 4194; AVX2-NEXT: vpermd %ymm10, %ymm2, %ymm4 4195; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 4196; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 4197; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4198; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm0 4199; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 4200; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4201; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4202; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 4203; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4204; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4205; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 4206; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 4207; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm0 4208; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] 4209; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 4210; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm0 4211; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] 4212; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] 4213; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm4 4214; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4215; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4216; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] 4217; AVX2-NEXT: vpshufb %xmm13, %xmm1, %xmm1 4218; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4219; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 4220; AVX2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 4221; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 4222; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] 4223; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] 4224; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 4225; AVX2-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload 4226; AVX2-NEXT: # xmm1 = mem[0,0,1,1] 4227; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] 4228; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 4229; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm4 4230; AVX2-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4231; AVX2-NEXT: # xmm1 = mem[0,0,1,1] 4232; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] 4233; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 4234; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] 4235; AVX2-NEXT: vpshufb %ymm0, %ymm14, %ymm1 4236; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 4237; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] 4238; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] 4239; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 4240; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 4241; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm0 4242; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 4243; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] 4244; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 4245; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] 4246; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm3 4247; AVX2-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload 4248; AVX2-NEXT: # ymm5 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 4249; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] 4250; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] 4251; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 4252; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 4253; AVX2-NEXT: vpshufb %ymm2, %ymm15, %ymm2 4254; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm9[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 4255; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] 4256; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2 4257; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 4258; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 4259; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] 4260; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 4261; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 4262; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 4263; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 4264; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,3,3,6,6,7,7] 4265; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 4266; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 4267; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 4268; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,3,3,6,6,7,7] 4269; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 4270; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 4271; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 4272; AVX2-NEXT: vmovaps %ymm2, 64(%r9) 4273; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 4274; AVX2-NEXT: vmovaps %ymm2, 224(%r9) 4275; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 4276; AVX2-NEXT: vmovaps %ymm2, 96(%r9) 4277; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 4278; AVX2-NEXT: vmovaps %ymm2, 256(%r9) 4279; AVX2-NEXT: vmovdqa %ymm0, 128(%r9) 4280; AVX2-NEXT: vmovdqa %ymm6, 160(%r9) 4281; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4282; AVX2-NEXT: vmovaps %ymm0, 192(%r9) 4283; AVX2-NEXT: vmovdqa %ymm1, 288(%r9) 4284; AVX2-NEXT: vmovdqa %ymm4, (%r9) 4285; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4286; AVX2-NEXT: vmovaps %ymm0, 32(%r9) 4287; AVX2-NEXT: addq $248, %rsp 4288; AVX2-NEXT: vzeroupper 4289; AVX2-NEXT: retq 4290; 4291; AVX2-FP-LABEL: store_i8_stride5_vf64: 4292; AVX2-FP: # %bb.0: 4293; AVX2-FP-NEXT: subq $200, %rsp 4294; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm12 4295; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 4296; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 4297; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm8 4298; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4299; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 4300; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 4301; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 4302; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4303; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9 4304; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4305; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 4306; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 4307; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 4308; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] 4309; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 4310; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4311; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 4312; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 4313; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 4314; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4315; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 4316; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 4317; AVX2-FP-NEXT: vpor %xmm4, %xmm6, %xmm4 4318; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] 4319; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 4320; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 
4321; AVX2-FP-NEXT: vmovdqa (%r8), %xmm4 4322; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4323; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] 4324; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] 4325; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 4326; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 4327; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4328; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 4329; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4330; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 4331; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 4332; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 4333; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm2 4334; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4335; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 4336; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 4337; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 4338; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm14 4339; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 4340; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] 4341; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 4342; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm1 4343; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4344; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] 4345; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] 4346; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 4347; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4348; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] 4349; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm1 4350; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] 4351; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] 4352; AVX2-FP-NEXT: vpshufb %ymm8, %ymm14, %ymm3 4353; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4354; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 4355; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 4356; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm11 4357; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] 4358; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 4359; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] 4360; AVX2-FP-NEXT: # ymm9 = mem[0,1,0,1] 4361; AVX2-FP-NEXT: vpshufb %ymm9, %ymm11, %ymm7 4362; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6 4363; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 4364; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] 4365; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] 4366; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 4367; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 4368; AVX2-FP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 4369; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm7 4370; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm1 4371; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm8 4372; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 4373; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm5 4374; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 4375; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm9 4376; 
AVX2-FP-NEXT: vpor %ymm5, %ymm9, %ymm5 4377; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 4378; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] 4379; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm5, %ymm8 4380; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm5 4381; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] 4382; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 4383; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 4384; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm0 4385; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4386; AVX2-FP-NEXT: vmovdqa (%r8), %ymm3 4387; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7] 4388; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 4389; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm0 4390; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4391; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] 4392; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm9 4393; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] 4394; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] 4395; AVX2-FP-NEXT: vpshufb %ymm10, %ymm11, %ymm15 4396; AVX2-FP-NEXT: vpor %ymm9, %ymm15, %ymm9 4397; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 4398; AVX2-FP-NEXT: # ymm15 = mem[0,1,0,1] 4399; AVX2-FP-NEXT: vpshufb %ymm15, %ymm14, %ymm0 4400; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] 4401; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm14 4402; AVX2-FP-NEXT: vpor %ymm0, %ymm14, %ymm0 4403; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] 4404; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 4405; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] 4406; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0 4407; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm8 4408; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm9 4409; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 4410; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm9 4411; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6 4412; AVX2-FP-NEXT: vpor %ymm6, %ymm9, %ymm6 4413; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 4414; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] 4415; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6 4416; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5] 4417; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] 4418; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 4419; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm10 4420; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,2,1,1,4,6,5,5] 4421; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] 4422; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm9 4423; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,3,3,0,4,4,4,4] 4424; AVX2-FP-NEXT: vpermd %ymm4, %ymm0, %ymm4 4425; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 4426; 
AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm8 4427; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] 4428; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm8, %ymm4 4429; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm0 4430; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 4431; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 4432; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 4433; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 4434; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 4435; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] 4436; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm8 4437; AVX2-FP-NEXT: vpor %ymm2, %ymm8, %ymm2 4438; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] 4439; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2 4440; AVX2-FP-NEXT: vpshufb %ymm1, %ymm7, %ymm1 4441; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm4 4442; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1 4443; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 4444; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,3,0,4,4,4] 4445; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm0 4446; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 4447; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 4448; AVX2-FP-NEXT: vpermd %ymm3, %ymm4, %ymm2 4449; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 4450; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4451; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 4452; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 4453; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload 4454; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 4455; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] 4456; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 4457; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4458; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] 4459; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 4460; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 4461; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] 4462; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] 4463; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 4464; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4465; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 4466; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] 4467; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 4468; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4469; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 4470; AVX2-FP-NEXT: # xmm4 = 
xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] 4471; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 4472; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] 4473; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] 4474; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 4475; AVX2-FP-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4476; AVX2-FP-NEXT: # xmm4 = mem[0,0,1,1] 4477; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] 4478; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 4479; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 4480; AVX2-FP-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 4481; AVX2-FP-NEXT: # xmm4 = mem[0,0,1,1] 4482; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] 4483; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 4484; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%r9) 4485; AVX2-FP-NEXT: vmovdqa %ymm0, 224(%r9) 4486; AVX2-FP-NEXT: vmovdqa %ymm9, 96(%r9) 4487; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4488; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r9) 4489; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4490; AVX2-FP-NEXT: vmovaps %ymm0, 288(%r9) 4491; AVX2-FP-NEXT: vmovdqa %ymm10, 256(%r9) 4492; AVX2-FP-NEXT: vmovdqa %ymm3, 160(%r9) 4493; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4494; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r9) 4495; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) 4496; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4497; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) 4498; AVX2-FP-NEXT: addq $200, %rsp 4499; AVX2-FP-NEXT: vzeroupper 4500; AVX2-FP-NEXT: retq 4501; 4502; AVX2-FCP-LABEL: store_i8_stride5_vf64: 4503; AVX2-FCP: # %bb.0: 4504; AVX2-FCP-NEXT: subq $168, %rsp 4505; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 4506; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 4507; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11 4508; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 4509; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4510; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 4511; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4512; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 4513; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 4514; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm3 4515; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4516; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 4517; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4518; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 4519; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 4520; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 4521; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] 4522; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 4523; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4524; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 4525; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 4526; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm7 4527; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4528; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 4529; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 4530; AVX2-FCP-NEXT: vpor %xmm4, %xmm7, %xmm4 4531; AVX2-FCP-NEXT: vpermq 
{{.*#+}} ymm4 = ymm4[0,0,1,1] 4532; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] 4533; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 4534; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 4535; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4536; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 4537; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 4538; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 4539; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm6 4540; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4541; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 4542; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm3 4543; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 4544; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] 4545; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 4546; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] 4547; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 4548; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm2 4549; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] 4550; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 4551; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4552; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm13 4553; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm1 4554; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 4555; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4556; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] 4557; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm1 4558; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] 4559; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] 4560; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm2 4561; AVX2-FCP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill 4562; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 4563; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 4564; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 4565; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] 4566; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm4 4567; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] 4568; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 4569; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 4570; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 4571; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 4572; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] 4573; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] 4574; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 4575; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm12 4576; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0 4577; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 4578; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm1 4579; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm10 4580; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 4581; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm9 4582; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm0 4583; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm8 4584; AVX2-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8 4585; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,3,3] 4586; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] 4587; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm7 4588; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] 4589; AVX2-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm9 4590; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 4591; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4 4592; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4593; AVX2-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm4 4594; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4 4595; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4596; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] 4597; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm7 4598; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] 4599; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 4600; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm9 4601; AVX2-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 4602; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 4603; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] 4604; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm10 4605; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] 4606; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm15 4607; AVX2-FCP-NEXT: vpor %ymm10, %ymm15, %ymm10 4608; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 4609; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] 4610; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] 4611; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm7, %ymm10, %ymm7 4612; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm6 4613; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm8 4614; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 4615; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm8 4616; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm4 4617; AVX2-FCP-NEXT: vpor %ymm4, %ymm8, %ymm4 4618; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] 4619; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] 4620; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 4621; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] 4622; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm8 4623; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] 4624; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm9 4625; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm6 4626; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm7 4627; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,0,4,4,4,4] 4628; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 4629; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 4630; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 4631; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] 4632; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 4633; AVX2-FCP-NEXT: vpermd %ymm1, 
%ymm4, %ymm1 4634; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 4635; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 4636; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 4637; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload 4638; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 4639; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] 4640; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm6 4641; AVX2-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3 4642; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] 4643; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 4644; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 4645; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 4646; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 4647; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 4648; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,3,0,4,4,4] 4649; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm0 4650; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 4651; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 4652; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm2 4653; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 4654; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4655; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 4656; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] 4657; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4658; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 4659; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] 4660; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 4661; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4662; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] 4663; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 4664; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 4665; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] 4666; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] 4667; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 4668; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4669; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 4670; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] 4671; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 4672; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4673; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload 4674; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] 4675; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 4676; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] 4677; AVX2-FCP-NEXT: vpermq 
{{.*#+}} ymm4 = ymm4[0,0,1,1] 4678; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 4679; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] 4680; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm5 4681; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] 4682; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 4683; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm4 4684; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 4685; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%r9) 4686; AVX2-FCP-NEXT: vmovdqa %ymm0, 224(%r9) 4687; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%r9) 4688; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4689; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) 4690; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4691; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%r9) 4692; AVX2-FCP-NEXT: vmovdqa %ymm9, 256(%r9) 4693; AVX2-FCP-NEXT: vmovdqa %ymm3, 160(%r9) 4694; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4695; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) 4696; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) 4697; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4698; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) 4699; AVX2-FCP-NEXT: addq $168, %rsp 4700; AVX2-FCP-NEXT: vzeroupper 4701; AVX2-FCP-NEXT: retq 4702; 4703; AVX512-LABEL: store_i8_stride5_vf64: 4704; AVX512: # %bb.0: 4705; AVX512-NEXT: vmovdqa 32(%rsi), %ymm11 4706; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] 4707; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0 4708; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 4709; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 4710; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] 4711; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm1 4712; AVX512-NEXT: vmovdqa64 %ymm2, %ymm19 4713; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm20 4714; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12 4715; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 4716; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm0 4717; AVX512-NEXT: vmovdqa64 %xmm1, %xmm28 4718; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10 4719; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 4720; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm1 4721; AVX512-NEXT: vmovdqa64 %xmm2, %xmm29 4722; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm21 4723; AVX512-NEXT: vmovdqa 32(%rcx), %ymm15 4724; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 4725; AVX512-NEXT: vpshufb %ymm8, %ymm15, %ymm0 4726; AVX512-NEXT: vmovdqa 32(%rdx), %ymm13 4727; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] 4728; AVX512-NEXT: vpshufb %ymm3, %ymm13, %ymm1 4729; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm22 4730; AVX512-NEXT: vmovdqa 32(%rcx), %xmm6 4731; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 4732; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm0 4733; AVX512-NEXT: vmovdqa64 %xmm1, %xmm30 4734; AVX512-NEXT: vmovdqa 32(%rdx), %xmm7 4735; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 4736; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm1 4737; AVX512-NEXT: vmovdqa64 %xmm2, 
%xmm31 4738; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm23 4739; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] 4740; AVX512-NEXT: # ymm9 = mem[0,1,0,1] 4741; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm0 4742; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] 4743; AVX512-NEXT: # ymm4 = mem[0,1,0,1] 4744; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm1 4745; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 4746; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] 4747; AVX512-NEXT: # ymm5 = mem[0,1,0,1] 4748; AVX512-NEXT: vpshufb %ymm5, %ymm11, %ymm1 4749; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] 4750; AVX512-NEXT: # ymm2 = mem[0,1,0,1] 4751; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm11 4752; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 4753; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] 4754; AVX512-NEXT: # ymm11 = mem[0,1,0,1] 4755; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm1 4756; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 4757; AVX512-NEXT: # ymm0 = mem[0,1,0,1] 4758; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm14 4759; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm14, %zmm25 4760; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] 4761; AVX512-NEXT: # ymm1 = mem[0,1,0,1] 4762; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm14 4763; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] 4764; AVX512-NEXT: # ymm15 = mem[0,1,0,1] 4765; AVX512-NEXT: vpshufb %ymm15, %ymm13, %ymm13 4766; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27 4767; AVX512-NEXT: vmovdqa (%rcx), %ymm13 4768; AVX512-NEXT: vpshufb %ymm8, %ymm13, %ymm8 4769; AVX512-NEXT: vmovdqa (%rdx), %ymm14 4770; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm3 4771; AVX512-NEXT: vporq %ymm8, %ymm3, %ymm16 4772; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0 4773; AVX512-NEXT: vpshufb %ymm15, %ymm14, %ymm3 4774; AVX512-NEXT: vporq %ymm0, %ymm3, %ymm17 4775; AVX512-NEXT: vmovdqa (%rsi), %ymm3 4776; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0 4777; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 4778; AVX512-NEXT: vmovdqa (%rdi), %ymm8 4779; AVX512-NEXT: vmovdqa64 %ymm19, %ymm15 4780; AVX512-NEXT: vpshufb %ymm15, %ymm8, %ymm15 4781; AVX512-NEXT: vporq %ymm0, %ymm15, %ymm18 4782; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm0 4783; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 4784; AVX512-NEXT: vporq %ymm0, %ymm2, %ymm19 4785; AVX512-NEXT: vpshufb %ymm11, %ymm14, %ymm0 4786; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm1 4787; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 4788; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm1 4789; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm2 4790; AVX512-NEXT: vmovdqa (%rdi), %xmm5 4791; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 4792; AVX512-NEXT: vmovdqa (%rsi), %xmm9 4793; AVX512-NEXT: vmovdqa (%rcx), %xmm8 4794; AVX512-NEXT: vmovdqa64 
%xmm28, %xmm2 4795; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2 4796; AVX512-NEXT: vmovdqa64 %xmm29, %xmm3 4797; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm3 4798; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm4 4799; AVX512-NEXT: vmovdqa (%rdx), %xmm3 4800; AVX512-NEXT: vmovdqa 32(%r8), %ymm11 4801; AVX512-NEXT: vmovdqa64 %xmm30, %xmm2 4802; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm2 4803; AVX512-NEXT: vmovdqa64 %xmm31, %xmm13 4804; AVX512-NEXT: vpshufb %xmm13, %xmm3, %xmm13 4805; AVX512-NEXT: vpor %xmm2, %xmm13, %xmm13 4806; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] 4807; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm2 4808; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = mem[1,1,2,2] 4809; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] 4810; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 4811; AVX512-NEXT: vpandnq %ymm15, %ymm28, %ymm15 4812; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 4813; AVX512-NEXT: vmovdqa (%r8), %ymm15 4814; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm14 4815; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5] 4816; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 4817; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,3,2] 4818; AVX512-NEXT: vpandnq %ymm15, %ymm29, %ymm15 4819; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 4820; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 4821; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 4822; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6 4823; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 4824; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] 4825; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 4826; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm10 4827; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] 4828; AVX512-NEXT: vmovdqa64 (%r8), %zmm15 4829; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7] 4830; AVX512-NEXT: vpermd %zmm11, %zmm30, %zmm30 4831; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17] 4832; AVX512-NEXT: vpermi2d %zmm11, %zmm15, %zmm31 4833; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 4834; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm5 4835; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 4836; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] 4837; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3 4838; AVX512-NEXT: vinserti32x4 $2, %xmm13, %zmm3, %zmm3 4839; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,0,1,1] 4840; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm5, %zmm5 4841; 
AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1] 4842; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm7 4843; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 4844; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5)) 4845; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm28) 4846; AVX512-NEXT: vporq %zmm24, %zmm26, %zmm5 4847; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7] 4848; AVX512-NEXT: vporq %zmm25, %zmm27, %zmm7 4849; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7] 4850; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 4851; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm9 & (zmm7 ^ zmm5)) 4852; AVX512-NEXT: vpternlogd {{.*#+}} zmm30 = zmm30 ^ (zmm29 & (zmm30 ^ zmm7)) 4853; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm17[2,2,3,3] 4854; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm16, %zmm5 4855; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm19[2,2,3,3] 4856; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 4857; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5)) 4858; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 4859; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 4860; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 4861; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 4862; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm9 & (zmm1 ^ zmm0)) 4863; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem) 4864; AVX512-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm1)) 4865; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm4[0,0,1,1,4,4,5,5] 4866; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5] 4867; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) 4868; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] 4869; AVX512-NEXT: vpermd %zmm15, %zmm0, %zmm0 4870; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 4871; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r9) 4872; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) 4873; AVX512-NEXT: vmovdqa64 %zmm31, 128(%r9) 4874; AVX512-NEXT: vmovdqa64 %zmm30, 256(%r9) 4875; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r9) 4876; AVX512-NEXT: vzeroupper 4877; AVX512-NEXT: retq 4878; 4879; AVX512-FCP-LABEL: store_i8_stride5_vf64: 4880; AVX512-FCP: # %bb.0: 4881; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 4882; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] 4883; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1 4884; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 4885; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] 4886; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 4887; AVX512-FCP-NEXT: vporq %ymm1, %ymm3, %ymm17 4888; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 4889; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] 4890; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm5 4891; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 4892; 
AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm4 4893; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm28 4894; AVX512-FCP-NEXT: vporq %xmm1, %xmm4, %xmm18 4895; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 4896; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 4897; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm4 4898; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm11 4899; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] 4900; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm6 4901; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm19 4902; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 4903; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12] 4904; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm7 4905; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 4906; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm8 4907; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm29 4908; AVX512-FCP-NEXT: vporq %xmm4, %xmm8, %xmm20 4909; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] 4910; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] 4911; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4 4912; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm31 4913; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] 4914; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] 4915; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 4916; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm21 4917; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] 4918; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] 4919; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm8 4920; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 4921; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] 4922; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] 4923; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 4924; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm22 4925; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] 4926; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 4927; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm8 4928; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 4929; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] 4930; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm15 4931; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm24 4932; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] 4933; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] 4934; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 4935; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] 4936; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4937; AVX512-FCP-NEXT: vpshufb %ymm15, 
%ymm11, %ymm11 4938; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 4939; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 4940; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm10 4941; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm11 4942; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm12 4943; AVX512-FCP-NEXT: vporq %ymm10, %ymm12, %ymm23 4944; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 4945; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm10 4946; AVX512-FCP-NEXT: vporq %ymm0, %ymm10, %ymm25 4947; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm10 4948; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm0 4949; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 4950; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9 4951; AVX512-FCP-NEXT: vporq %ymm0, %ymm9, %ymm27 4952; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm0 4953; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 4954; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 4955; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 4956; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11] 4957; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm14 4958; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 4959; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2 4960; AVX512-FCP-NEXT: vporq %xmm0, %xmm2, %xmm28 4961; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm13 4962; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12] 4963; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 4964; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm2 4965; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm2 4966; AVX512-FCP-NEXT: vporq %xmm0, %xmm2, %xmm29 4967; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm0 4968; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 4969; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 4970; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 4971; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm1 4972; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 4973; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 4974; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 4975; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 4976; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] 4977; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 4978; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2] 4979; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 4980; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 4981; AVX512-FCP-NEXT: vpandn %ymm10, %ymm11, %ymm10 4982; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 4983; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 4984; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] 4985; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 4986; AVX512-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm31 4987; AVX512-FCP-NEXT: vpandnq %ymm31, %ymm30, %ymm31 4988; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 4989; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm8, %zmm8 4990; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] 4991; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 4992; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9 4993; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm9, %zmm9 4994; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] 4995; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 4996; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 4997; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm29, %zmm13, %zmm13 4998; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm28 4999; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm10 5000; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 5001; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 5002; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9] 5003; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 5004; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 5005; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 5006; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 5007; AVX512-FCP-NEXT: vporq %zmm21, %zmm22, %zmm3 5008; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7] 5009; AVX512-FCP-NEXT: vporq %zmm24, %zmm26, %zmm5 5010; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7] 5011; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 5012; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3)) 5013; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm12, %zmm4 5014; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm30 & (zmm4 ^ zmm5)) 5015; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%r9) 5016; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1] 5017; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 5018; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1] 5019; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 5020; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 5021; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) 5022; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm4 & zmm11) 5023; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3] 5024; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 5025; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3] 5026; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 5027; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) 5028; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm6 & (zmm1 ^ zmm0)) 5029; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem) 5030; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5] 5031; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5] 5032; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ 
(mem & (zmm3 ^ zmm0)) 5033; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] 5034; AVX512-FCP-NEXT: vpermd %zmm10, %zmm0, %zmm0 5035; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) 5036; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] 5037; AVX512-FCP-NEXT: vpermd %zmm28, %zmm3, %zmm3 5038; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) 5039; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) 5040; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) 5041; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) 5042; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r9) 5043; AVX512-FCP-NEXT: vzeroupper 5044; AVX512-FCP-NEXT: retq 5045; 5046; AVX512DQ-LABEL: store_i8_stride5_vf64: 5047; AVX512DQ: # %bb.0: 5048; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm11 5049; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] 5050; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0 5051; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18 5052; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 5053; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] 5054; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm1 5055; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm19 5056; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm20 5057; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12 5058; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 5059; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm0 5060; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm28 5061; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10 5062; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 5063; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm1 5064; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29 5065; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm21 5066; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15 5067; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 5068; AVX512DQ-NEXT: vpshufb %ymm8, %ymm15, %ymm0 5069; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm13 5070; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] 5071; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm1 5072; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm22 5073; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm6 5074; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 5075; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm0 5076; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm30 5077; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm7 5078; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 5079; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm1 5080; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31 5081; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm23 5082; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] 5083; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] 5084; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm0 5085; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] 5086; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] 5087; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm1 5088; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm0, %zmm1, %zmm24 5089; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] 5090; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] 5091; AVX512DQ-NEXT: vpshufb %ymm5, %ymm11, %ymm1 5092; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] 5093; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] 5094; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm11 5095; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 5096; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] 5097; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] 5098; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm1 5099; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 5100; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] 5101; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm14 5102; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm14, %zmm25 5103; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] 5104; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] 5105; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm14 5106; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] 5107; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,1] 5108; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm13 5109; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27 5110; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13 5111; AVX512DQ-NEXT: vpshufb %ymm8, %ymm13, %ymm8 5112; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm14 5113; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm3 5114; AVX512DQ-NEXT: vporq %ymm8, %ymm3, %ymm16 5115; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0 5116; AVX512DQ-NEXT: vpshufb %ymm15, %ymm14, %ymm3 5117; AVX512DQ-NEXT: vporq %ymm0, %ymm3, %ymm17 5118; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 5119; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 5120; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 5121; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 5122; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm15 5123; AVX512DQ-NEXT: vpshufb %ymm15, %ymm8, %ymm15 5124; AVX512DQ-NEXT: vporq %ymm0, %ymm15, %ymm18 5125; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm0 5126; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 5127; AVX512DQ-NEXT: vporq %ymm0, %ymm2, %ymm19 5128; AVX512DQ-NEXT: vpshufb %ymm11, %ymm14, %ymm0 5129; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm1 5130; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 5131; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm1 5132; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm2 5133; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5 5134; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 5135; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 5136; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 5137; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm2 5138; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2 5139; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm3 5140; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm3 5141; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm4 5142; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 5143; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm11 5144; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm2 5145; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm2 5146; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm13 5147; AVX512DQ-NEXT: vpshufb %xmm13, %xmm3, %xmm13 5148; 
AVX512DQ-NEXT: vpor %xmm2, %xmm13, %xmm13 5149; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] 5150; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm2 5151; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = mem[1,1,2,2] 5152; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] 5153; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 5154; AVX512DQ-NEXT: vpandnq %ymm15, %ymm28, %ymm15 5155; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 5156; AVX512DQ-NEXT: vmovdqa (%r8), %ymm15 5157; AVX512DQ-NEXT: vpshufb %ymm14, %ymm15, %ymm14 5158; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5] 5159; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 5160; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,3,2] 5161; AVX512DQ-NEXT: vpandnq %ymm15, %ymm29, %ymm15 5162; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 5163; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 5164; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 5165; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6 5166; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 5167; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] 5168; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 5169; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm10 5170; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] 5171; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm15 5172; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7] 5173; AVX512DQ-NEXT: vpermd %zmm11, %zmm30, %zmm30 5174; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17] 5175; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm15, %zmm31 5176; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 5177; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm5 5178; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 5179; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] 5180; AVX512DQ-NEXT: vpshufb %xmm7, %xmm3, %xmm3 5181; AVX512DQ-NEXT: vinserti32x4 $2, %xmm13, %zmm3, %zmm3 5182; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,0,1,1] 5183; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm5, %zmm5 5184; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1] 5185; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm7 5186; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 5187; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5)) 5188; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm28) 5189; AVX512DQ-NEXT: vporq %zmm24, %zmm26, %zmm5 5190; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7] 5191; AVX512DQ-NEXT: vporq %zmm25, %zmm27, %zmm7 5192; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7] 5193; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 5194; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm9 & (zmm7 ^ zmm5)) 5195; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm30 = zmm30 ^ (zmm29 & (zmm30 ^ zmm7)) 5196; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm17[2,2,3,3] 5197; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm16, %zmm5 5198; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm19[2,2,3,3] 5199; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 5200; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5)) 5201; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 5202; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 5203; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] 5204; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 5205; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm9 & (zmm1 ^ zmm0)) 5206; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem) 5207; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm1)) 5208; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm4[0,0,1,1,4,4,5,5] 5209; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5] 5210; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) 5211; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] 5212; AVX512DQ-NEXT: vpermd %zmm15, %zmm0, %zmm0 5213; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) 5214; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r9) 5215; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) 5216; AVX512DQ-NEXT: vmovdqa64 %zmm31, 128(%r9) 5217; AVX512DQ-NEXT: vmovdqa64 %zmm30, 256(%r9) 5218; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r9) 5219; AVX512DQ-NEXT: vzeroupper 5220; AVX512DQ-NEXT: retq 5221; 5222; AVX512DQ-FCP-LABEL: store_i8_stride5_vf64: 5223; AVX512DQ-FCP: # %bb.0: 5224; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 5225; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] 5226; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1 5227; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 5228; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] 5229; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 5230; AVX512DQ-FCP-NEXT: vporq %ymm1, %ymm3, %ymm17 5231; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 5232; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] 5233; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm5 5234; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 5235; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm4 5236; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm28 5237; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm4, %xmm18 5238; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 5239; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 5240; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm4 5241; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm11 5242; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] 5243; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm6 5244; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm6, %ymm19 5245; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 5246; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12] 5247; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm7 5248; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 5249; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm8 5250; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm29 5251; AVX512DQ-FCP-NEXT: vporq %xmm4, %xmm8, %xmm20 5252; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] 5253; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] 5254; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4 5255; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm31 5256; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] 5257; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] 5258; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 5259; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm21 5260; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] 5261; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] 5262; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm8 5263; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 5264; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] 5265; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] 5266; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 5267; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm22 5268; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] 5269; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 5270; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm8 5271; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 5272; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] 5273; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm15 5274; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm24 5275; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] 5276; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] 5277; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 5278; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] 5279; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] 5280; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm11 5281; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 5282; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 5283; AVX512DQ-FCP-NEXT: 
vpshufb %ymm10, %ymm1, %ymm10 5284; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm11 5285; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm12 5286; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm12, %ymm23 5287; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 5288; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm10 5289; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm10, %ymm25 5290; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm10 5291; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm0 5292; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 5293; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9 5294; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm9, %ymm27 5295; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm0 5296; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 5297; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 5298; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 5299; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11] 5300; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm14 5301; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 5302; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2 5303; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm2, %xmm28 5304; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm13 5305; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12] 5306; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 5307; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm2 5308; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm2 5309; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm2, %xmm29 5310; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm0 5311; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 5312; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 5313; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 5314; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm1 5315; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 5316; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 5317; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 5318; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 5319; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] 5320; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 5321; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2] 5322; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 5323; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] 5324; AVX512DQ-FCP-NEXT: vpandn %ymm10, %ymm11, %ymm10 5325; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 5326; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm10 5327; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] 5328; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] 5329; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm31 5330; AVX512DQ-FCP-NEXT: vpandnq %ymm31, %ymm30, %ymm31 5331; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 5332; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm8, %zmm8 5333; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] 5334; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 5335; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9 5336; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm9, %zmm9 5337; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] 5338; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 5339; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 5340; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm29, %zmm13, %zmm13 5341; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm28 5342; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm10 5343; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 5344; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 5345; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9] 5346; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 5347; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 5348; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 5349; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 5350; AVX512DQ-FCP-NEXT: vporq %zmm21, %zmm22, %zmm3 5351; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7] 5352; AVX512DQ-FCP-NEXT: vporq %zmm24, %zmm26, %zmm5 5353; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7] 5354; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] 5355; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3)) 5356; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm12, %zmm4 5357; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm30 & (zmm4 ^ zmm5)) 5358; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%r9) 5359; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1] 5360; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 5361; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1] 5362; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 5363; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] 5364; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) 5365; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm4 & zmm11) 5366; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3] 5367; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 5368; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3] 5369; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 5370; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) 5371; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm6 & (zmm1 ^ zmm0)) 5372; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem) 5373; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5] 5374; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm3 = 
zmm13[0,0,1,1,4,4,5,5] 5375; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) 5376; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] 5377; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm0, %zmm0 5378; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) 5379; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] 5380; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm3, %zmm3 5381; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) 5382; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) 5383; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) 5384; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) 5385; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r9) 5386; AVX512DQ-FCP-NEXT: vzeroupper 5387; AVX512DQ-FCP-NEXT: retq 5388; 5389; AVX512BW-LABEL: store_i8_stride5_vf64: 5390; AVX512BW: # %bb.0: 5391; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 5392; AVX512BW-NEXT: vmovdqa (%rcx), %ymm0 5393; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] 5394; AVX512BW-NEXT: vpshufb %ymm8, %ymm0, %ymm3 5395; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 5396; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 5397; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] 5398; AVX512BW-NEXT: movl $693250386, %eax # imm = 0x29522952 5399; AVX512BW-NEXT: kmovd %eax, %k1 5400; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} 5401; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 5402; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm6 5403; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm12 5404; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] 5405; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 5406; AVX512BW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 5407; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] 5408; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm10 5409; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4 5410; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] 5411; AVX512BW-NEXT: vpshufb %ymm15, %ymm4, %ymm3 5412; AVX512BW-NEXT: vmovdqa (%rdi), %ymm5 5413; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 5414; AVX512BW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] 5415; AVX512BW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 5416; AVX512BW-NEXT: kmovd %eax, %k5 5417; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm3 {%k5} 5418; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 5419; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm13 5420; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm14 5421; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 5422; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 5423; AVX512BW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 5424; AVX512BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] 5425; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 5426; AVX512BW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 5427; AVX512BW-NEXT: kmovq %rax, %k4 5428; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm3 {%k4} 5429; AVX512BW-NEXT: vmovdqa64 32(%r8), %ymm16 5430; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] 5431; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm10 5432; AVX512BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 5433; AVX512BW-NEXT: kmovq %rax, %k2 5434; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm3 {%k2} 5435; AVX512BW-NEXT: vmovdqa64 32(%rdx), %ymm23 5436; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] 5437; AVX512BW-NEXT: vpshufb %ymm10, %ymm23, %ymm17 5438; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm24 5439; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 5440; AVX512BW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 5441; AVX512BW-NEXT: vporq %ymm17, %ymm18, %ymm17 5442; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 5443; AVX512BW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 5444; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 5445; AVX512BW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 5446; AVX512BW-NEXT: vpor %xmm6, %xmm12, %xmm6 5447; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 5448; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 5449; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 5450; AVX512BW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 5451; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 5452; AVX512BW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 5453; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 5454; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] 5455; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm25 5456; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [3,3,3,0,4,4,4,4] 5457; AVX512BW-NEXT: vpermd %ymm25, %ymm12, %ymm17 5458; AVX512BW-NEXT: vmovdqa64 32(%rsi), %ymm26 5459; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 5460; AVX512BW-NEXT: movl $138547332, %eax # imm = 0x8421084 5461; AVX512BW-NEXT: kmovd %eax, %k3 5462; AVX512BW-NEXT: vpshufb %ymm13, %ymm26, %ymm17 {%k3} 5463; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 5464; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 5465; AVX512BW-NEXT: kmovq %rax, %k2 5466; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} 5467; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,3,3,3,0,4,4,4] 5468; AVX512BW-NEXT: vpermd %ymm16, %ymm14, %ymm17 5469; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] 5470; AVX512BW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] 5471; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 5472; AVX512BW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 5473; AVX512BW-NEXT: kmovq %rax, %k6 5474; AVX512BW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} 5475; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] 5476; AVX512BW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] 5477; AVX512BW-NEXT: vpshufb %ymm17, %ymm26, %ymm27 5478; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] 5479; AVX512BW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 5480; AVX512BW-NEXT: vporq %ymm27, %ymm28, %ymm27 5481; AVX512BW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 5482; AVX512BW-NEXT: vpshufhw 
{{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 5483; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] 5484; AVX512BW-NEXT: vmovdqu8 %ymm25, %ymm15 {%k5} 5485; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm15 5486; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] 5487; AVX512BW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 5488; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 5489; AVX512BW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] 5490; AVX512BW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 5491; AVX512BW-NEXT: vporq %ymm26, %ymm28, %ymm26 5492; AVX512BW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 5493; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 5494; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] 5495; AVX512BW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} 5496; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 5497; AVX512BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,2,3,3,6,6,7,7] 5498; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7] 5499; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} 5500; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] 5501; AVX512BW-NEXT: vpermd %zmm16, %zmm15, %zmm15 5502; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm16 5503; AVX512BW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 5504; AVX512BW-NEXT: kmovq %rax, %k1 5505; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} 5506; AVX512BW-NEXT: vmovdqa (%rcx), %xmm15 5507; AVX512BW-NEXT: vpshufb %xmm20, %xmm15, %xmm20 5508; AVX512BW-NEXT: vpshufb %xmm22, %xmm16, %xmm22 5509; AVX512BW-NEXT: vporq %xmm20, %xmm22, %xmm20 5510; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] 5511; AVX512BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 5512; AVX512BW-NEXT: vmovdqa (%rsi), %xmm15 5513; AVX512BW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 5514; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 5515; AVX512BW-NEXT: vpshufb %xmm19, %xmm16, %xmm19 5516; AVX512BW-NEXT: vpshufb %xmm21, %xmm15, %xmm20 5517; AVX512BW-NEXT: vporq %xmm19, %xmm20, %xmm19 5518; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] 5519; AVX512BW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 5520; AVX512BW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 5521; AVX512BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] 5522; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5] 5523; AVX512BW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C 5524; AVX512BW-NEXT: kmovq %rax, %k1 5525; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} 5526; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] 5527; AVX512BW-NEXT: vpermd %zmm2, %zmm7, %zmm2 5528; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 5529; AVX512BW-NEXT: kmovq %rax, %k1 5530; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} 5531; AVX512BW-NEXT: vpshufb %ymm10, %ymm1, %ymm2 5532; AVX512BW-NEXT: vpshufb %ymm11, %ymm0, %ymm7 5533; AVX512BW-NEXT: vpor %ymm2, %ymm7, %ymm2 5534; AVX512BW-NEXT: vpshufb %ymm25, %ymm1, %ymm1 5535; AVX512BW-NEXT: vpshufb %ymm27, %ymm0, %ymm0 5536; AVX512BW-NEXT: 
vpor %ymm1, %ymm0, %ymm0 5537; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 5538; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 5539; AVX512BW-NEXT: vpermd %ymm5, %ymm12, %ymm1 5540; AVX512BW-NEXT: vpshufb %ymm13, %ymm4, %ymm1 {%k3} 5541; AVX512BW-NEXT: vpshufb %ymm17, %ymm4, %ymm2 5542; AVX512BW-NEXT: vpshufb %ymm18, %ymm5, %ymm4 5543; AVX512BW-NEXT: vpor %ymm2, %ymm4, %ymm2 5544; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 5545; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 5546; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} 5547; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 5548; AVX512BW-NEXT: vpermd %ymm0, %ymm14, %ymm2 5549; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] 5550; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] 5551; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 5552; AVX512BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 5553; AVX512BW-NEXT: kmovq %rax, %k1 5554; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} 5555; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) 5556; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r9) 5557; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) 5558; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) 5559; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%r9) 5560; AVX512BW-NEXT: vzeroupper 5561; AVX512BW-NEXT: retq 5562; 5563; AVX512BW-FCP-LABEL: store_i8_stride5_vf64: 5564; AVX512BW-FCP: # %bb.0: 5565; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 5566; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 5567; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] 5568; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1 5569; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm21 5570; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 5571; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm2 5572; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 5573; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 5574; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 5575; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 5576; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 5577; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 5578; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4 5579; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 5580; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm5 5581; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 5582; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] 5583; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 5584; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm13 5585; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 5586; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 5587; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9 5588; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm18 5589; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5 5590; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 5591; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10 5592; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 5593; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1] 5594; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16 5595; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] 5596; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm9, %ymm22 5597; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23 
5598; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 5599; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 5600; AVX512BW-FCP-NEXT: kmovd %eax, %k1 5601; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm22 {%k1} 5602; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15 5603; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 5604; AVX512BW-FCP-NEXT: kmovq %rax, %k2 5605; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k2} 5606; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] 5607; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm15, %zmm15 5608; AVX512BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 5609; AVX512BW-FCP-NEXT: kmovq %rax, %k3 5610; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k3} 5611; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7] 5612; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] 5613; AVX512BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22 5614; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7] 5615; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] 5616; AVX512BW-FCP-NEXT: vpshufb %zmm16, %zmm23, %zmm23 5617; AVX512BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22 5618; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7] 5619; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] 5620; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] 5621; AVX512BW-FCP-NEXT: vpshufb %zmm23, %zmm8, %zmm8 5622; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] 5623; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] 5624; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21 5625; AVX512BW-FCP-NEXT: vporq %zmm8, %zmm21, %zmm8 5626; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7] 5627; AVX512BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 5628; AVX512BW-FCP-NEXT: kmovq %rax, %k3 5629; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k3} 5630; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] 5631; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm21 5632; AVX512BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 5633; AVX512BW-FCP-NEXT: kmovq %rax, %k4 5634; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4} 5635; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14 5636; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17 5637; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14 5638; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 5639; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 5640; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 5641; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11 5642; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5] 5643; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11 5644; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm18, %xmm17 5645; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11 5646; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7] 5647; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 5648; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm13 5649; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11 5650; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] 5651; AVX512BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C 5652; AVX512BW-FCP-NEXT: kmovq %rax, %k4 5653; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4} 5654; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] 5655; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] 5656; AVX512BW-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm14 5657; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 5658; AVX512BW-FCP-NEXT: kmovq %rax, %k4 5659; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4} 5660; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 5661; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6 5662; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm18 5663; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm18, %ymm7 5664; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 5665; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm14, %ymm7 5666; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm18, %ymm19 5667; AVX512BW-FCP-NEXT: vporq %ymm7, %ymm19, %ymm7 5668; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 5669; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 5670; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm7 5671; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15 5672; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm19 5673; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm19, %ymm16 5674; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm15 5675; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] 5676; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9 5677; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1} 5678; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9 5679; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2} 5680; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] 5681; AVX512BW-FCP-NEXT: vpermd %zmm13, %zmm6, %zmm6 5682; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 5683; AVX512BW-FCP-NEXT: kmovq %rax, %k1 5684; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1} 5685; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 5686; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 5687; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero 5688; AVX512BW-FCP-NEXT: vpshufb 
{{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero 5689; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 5690; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] 5691; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 5692; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 5693; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm2, %xmm2 5694; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30] 5695; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero 5696; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 5697; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 5698; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3} 5699; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] 5700; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 5701; AVX512BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 5702; AVX512BW-FCP-NEXT: kmovq %rax, %k1 5703; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} 5704; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) 5705; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) 5706; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) 5707; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) 5708; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r9) 5709; AVX512BW-FCP-NEXT: vzeroupper 5710; AVX512BW-FCP-NEXT: retq 5711; 5712; AVX512DQ-BW-LABEL: store_i8_stride5_vf64: 5713; AVX512DQ-BW: # %bb.0: 5714; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 5715; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm0 5716; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] 5717; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm0, %ymm3 5718; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 5719; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 5720; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] 5721; AVX512DQ-BW-NEXT: movl $693250386, %eax # imm = 0x29522952 5722; AVX512DQ-BW-NEXT: kmovd %eax, %k1 5723; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} 5724; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 5725; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm6 5726; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm12 5727; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] 5728; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 5729; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 5730; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] 5731; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm10 5732; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm4 5733; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] 5734; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm4, %ymm3 5735; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm5 5736; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 5737; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] 5738; 
AVX512DQ-BW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 5739; AVX512DQ-BW-NEXT: kmovd %eax, %k5 5740; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm3 {%k5} 5741; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] 5742; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm13 5743; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm14 5744; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 5745; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 5746; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 5747; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] 5748; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 5749; AVX512DQ-BW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 5750; AVX512DQ-BW-NEXT: kmovq %rax, %k4 5751; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm3 {%k4} 5752; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %ymm16 5753; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] 5754; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm10 5755; AVX512DQ-BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 5756; AVX512DQ-BW-NEXT: kmovq %rax, %k2 5757; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm3 {%k2} 5758; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %ymm23 5759; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] 5760; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm23, %ymm17 5761; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm24 5762; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 5763; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 5764; AVX512DQ-BW-NEXT: vporq %ymm17, %ymm18, %ymm17 5765; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 5766; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 5767; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 5768; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 5769; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm12, %xmm6 5770; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] 5771; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 5772; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 5773; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 5774; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 5775; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 5776; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm13, %xmm12 5777; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] 5778; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm25 5779; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [3,3,3,0,4,4,4,4] 5780; AVX512DQ-BW-NEXT: vpermd %ymm25, %ymm12, %ymm17 5781; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %ymm26 5782; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 5783; AVX512DQ-BW-NEXT: movl $138547332, %eax # imm = 0x8421084 5784; AVX512DQ-BW-NEXT: kmovd %eax, %k3 5785; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm26, %ymm17 {%k3} 5786; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 5787; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 5788; AVX512DQ-BW-NEXT: kmovq %rax, 
%k2 5789; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} 5790; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,3,3,3,0,4,4,4] 5791; AVX512DQ-BW-NEXT: vpermd %ymm16, %ymm14, %ymm17 5792; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] 5793; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] 5794; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 5795; AVX512DQ-BW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 5796; AVX512DQ-BW-NEXT: kmovq %rax, %k6 5797; AVX512DQ-BW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} 5798; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] 5799; AVX512DQ-BW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] 5800; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm26, %ymm27 5801; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] 5802; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 5803; AVX512DQ-BW-NEXT: vporq %ymm27, %ymm28, %ymm27 5804; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 5805; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] 5806; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] 5807; AVX512DQ-BW-NEXT: vmovdqu8 %ymm25, %ymm15 {%k5} 5808; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm15 5809; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] 5810; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 5811; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] 5812; AVX512DQ-BW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] 5813; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 5814; AVX512DQ-BW-NEXT: vporq %ymm26, %ymm28, %ymm26 5815; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 5816; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] 5817; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] 5818; AVX512DQ-BW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} 5819; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 5820; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,2,3,3,6,6,7,7] 5821; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7] 5822; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} 5823; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] 5824; AVX512DQ-BW-NEXT: vpermd %zmm16, %zmm15, %zmm15 5825; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm16 5826; AVX512DQ-BW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 5827; AVX512DQ-BW-NEXT: kmovq %rax, %k1 5828; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} 5829; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm15 5830; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm15, %xmm20 5831; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm16, %xmm22 5832; AVX512DQ-BW-NEXT: vporq %xmm20, %xmm22, %xmm20 5833; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] 5834; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 5835; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm15 5836; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 5837; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm16 5838; AVX512DQ-BW-NEXT: 
vpshufb %xmm19, %xmm16, %xmm19 5839; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm15, %xmm20 5840; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm20, %xmm19 5841; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] 5842; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 5843; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 5844; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] 5845; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5] 5846; AVX512DQ-BW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C 5847; AVX512DQ-BW-NEXT: kmovq %rax, %k1 5848; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} 5849; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] 5850; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm7, %zmm2 5851; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 5852; AVX512DQ-BW-NEXT: kmovq %rax, %k1 5853; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} 5854; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm1, %ymm2 5855; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm0, %ymm7 5856; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm7, %ymm2 5857; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm1, %ymm1 5858; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm0, %ymm0 5859; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 5860; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] 5861; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 5862; AVX512DQ-BW-NEXT: vpermd %ymm5, %ymm12, %ymm1 5863; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm4, %ymm1 {%k3} 5864; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm4, %ymm2 5865; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm5, %ymm4 5866; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm4, %ymm2 5867; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] 5868; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 5869; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} 5870; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 5871; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm14, %ymm2 5872; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] 5873; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] 5874; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 5875; AVX512DQ-BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 5876; AVX512DQ-BW-NEXT: kmovq %rax, %k1 5877; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} 5878; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%r9) 5879; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%r9) 5880; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%r9) 5881; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%r9) 5882; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 128(%r9) 5883; AVX512DQ-BW-NEXT: vzeroupper 5884; AVX512DQ-BW-NEXT: retq 5885; 5886; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf64: 5887; AVX512DQ-BW-FCP: # %bb.0: 5888; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 5889; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 5890; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] 5891; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1 5892; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm21 5893; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] 5894; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm2 5895; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 5896; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 5897; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 5898; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] 5899; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 5900; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 5901; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4 5902; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] 5903; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm5 5904; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 5905; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] 5906; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 5907; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm13 5908; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 5909; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] 5910; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9 5911; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm18 5912; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5 5913; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] 5914; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10 5915; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 5916; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1] 5917; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16 5918; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] 5919; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm9, %ymm22 5920; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23 5921; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] 5922; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 5923; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 5924; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm22 {%k1} 5925; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15 5926; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 5927; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 5928; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k2} 5929; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] 5930; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm15, %zmm15 5931; AVX512DQ-BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 5932; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 5933; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k3} 5934; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7] 5935; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] 5936; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22 5937; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7] 5938; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] 5939; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm16, %zmm23, %zmm23 5940; AVX512DQ-BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22 5941; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7] 5942; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] 5943; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] 5944; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm23, %zmm8, %zmm8 5945; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] 5946; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] 5947; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21 5948; AVX512DQ-BW-FCP-NEXT: vporq %zmm8, %zmm21, %zmm8 5949; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7] 5950; AVX512DQ-BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 5951; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 5952; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k3} 5953; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] 5954; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm21 5955; AVX512DQ-BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 5956; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 5957; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4} 5958; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14 5959; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17 5960; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14 5961; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 5962; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 5963; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 5964; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11 5965; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5] 5966; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11 5967; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm18, %xmm17 5968; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11 5969; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7] 5970; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] 5971; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm13 5972; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11 5973; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] 5974; AVX512DQ-BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C 5975; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 5976; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4} 5977; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] 5978; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] 5979; AVX512DQ-BW-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm14 5980; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 5981; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 5982; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4} 5983; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 5984; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6 5985; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm18 5986; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm18, %ymm7 5987; 
AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 5988; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm14, %ymm7 5989; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm18, %ymm19 5990; AVX512DQ-BW-FCP-NEXT: vporq %ymm7, %ymm19, %ymm7 5991; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] 5992; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 5993; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm7 5994; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15 5995; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm19 5996; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm19, %ymm16 5997; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm15 5998; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] 5999; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9 6000; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1} 6001; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9 6002; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2} 6003; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] 6004; AVX512DQ-BW-FCP-NEXT: vpermd %zmm13, %zmm6, %zmm6 6005; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 6006; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 6007; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1} 6008; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 6009; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 6010; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero 6011; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero 6012; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 6013; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] 6014; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 6015; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 6016; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm2, %xmm2 6017; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30] 6018; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero 6019; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 6020; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 6021; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3} 6022; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] 6023; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 6024; AVX512DQ-BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 6025; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 6026; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} 6027; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) 6028; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) 6029; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) 6030; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) 6031; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r9) 6032; AVX512DQ-BW-FCP-NEXT: vzeroupper 6033; 
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
  %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
  %in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64
  %in.vec4 = load <64 x i8>, ptr %in.vecptr4, align 64
  %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
  %4 = shufflevector <64 x i8> %in.vec4, <64 x i8> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <256 x i8> %3, <256 x i8> %4, <320 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319>
  %interleaved.vec = shufflevector <320 x i8> %5, <320 x i8> poison, <320 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 1, i32 65, i32 129, i32 193, i32 257, i32 2, i32 66, i32 130, i32 194, i32 258, i32 3, i32 67, i32 131, i32 195, i32 259, i32 4, i32 68, i32 132, i32 196, i32 260, i32 5, i32 69, i32 133, i32 197, i32 261, i32 6, i32 70, i32 134, i32 198, i32 262, i32 7, i32 71, i32 135, i32 199, i32 263, i32 8, i32 72, i32 136, i32 200, i32 264, i32 9, i32 73, i32 137, i32 201, i32 265, i32 10, i32 74, i32 138, i32 202, i32 266, i32 11, i32 75, i32 139, i32 203, i32 267, i32 12, i32 76, i32 140, i32 204, i32 268, i32 13, i32 77, i32 141, i32 205, i32 269, i32 14, i32 78, i32 142, i32 206, i32 270, i32 15, i32 79, i32 143, i32 207, i32 271, i32 16, i32 80, i32 144, i32 208, i32 272, i32 17, i32 81, i32 145, i32 209, i32 273, i32 18, i32 82, i32 146, i32 210, i32 274, i32 19, i32 83, i32 147, i32 211, i32 275, i32 20, i32 84, i32 148, i32 212, i32 276, i32 21, i32 85, i32 149, i32 213, i32 277, i32 22, i32 86, i32 150, i32 214, i32 278, i32 23, i32 87, i32 151, i32 215, i32 279, i32 24, i32 88, i32 152, i32 216, i32 280, i32 25, i32 89, i32 153, i32 217, i32 281, i32 26, i32 90, i32 154, i32 218, i32 282, i32 27, i32 91, i32 155, i32 219, i32 283, i32 28, i32 92, i32 156, i32 220, i32 284, i32 29, i32 93, i32 157, i32 221, i32 285, i32 30, i32 94, i32 158, i32 222, i32 286, i32 31, i32 95, i32 159, i32 223, i32 287, i32 32, i32 96, i32 160, i32 224, i32 288, i32 33, i32 97, i32 161, i32 225, i32 289, i32 34, i32 98, i32 162, i32 226, i32 290, i32 35, i32 99, i32 163, i32 227, i32 291, i32 36, i32 100, i32 164, i32 228, i32 292, i32 37, i32 101, i32 165, i32 229, i32 293, i32 38, i32 102, i32 166, i32 230, i32 294, i32 39, i32 103, i32 167, i32 231, i32 295, i32 40, i32 104, i32 168, i32 232, i32 296, i32 41, i32 105, i32 169, i32 233, i32 297, i32 42, i32 106, i32 170, i32 234, i32 298, i32 43, i32 107, i32 171, i32 235, i32 299, i32 44, i32 108, i32 172, i32 236, i32 300, i32 45, i32 109, i32 173, i32 237, i32 301, i32 46, i32 110, i32 174, i32 238, i32 302, i32 47, i32 111, i32 175, i32 239, i32 303, i32 48, i32 112, i32 176, i32 240, i32 304, i32 49, i32 113, i32 177, i32 241, i32 305, i32 50, i32 114, i32 178, i32 242, i32 306, i32 51, i32 115, i32 179, i32 243, i32 307, i32 52, i32 116, i32 180, i32 244, i32 308, i32 53, i32 117, i32 181, i32 245, i32 309, i32 54, i32 118, i32 182, i32 246, i32 310, i32 55, i32 119, i32 183, i32 247, i32 311, i32 56, i32 120, i32 184, i32 248, i32 312, i32 57, i32 121, i32 185, i32 249, i32 313, i32 58, i32 122, i32 186, i32 250, i32 314, i32 59, i32 123, i32 187, i32 251, i32 315, i32 60, i32 124, i32 188, i32 252, i32 316, i32 61, i32 125, i32 189, i32 253, i32 317, i32 62, i32 126, i32 190, i32 254, i32 318, i32 63, i32 127, i32 191, i32 255, i32 319>
  store <320 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
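; The %interleaved.vec mask above spells out the stride-5 layout directly:
; output byte 5*i+j is element i of the j-th concatenated source vector
; (i = 0..63, j = 0..4), so the bytes of the five inputs are emitted
; round-robin, matching what an interleaved store of stride 5 writes to memory.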