1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP 15 16; These patterns are produced by LoopVectorizer for interleaved stores. 
17 18define void @store_i8_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { 19; SSE-LABEL: store_i8_stride3_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movdqa (%rdi), %xmm0 22; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 23; SSE-NEXT: pshuflw {{.*#+}} xmm1 = mem[0,0,0,0,4,5,6,7] 24; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255] 25; SSE-NEXT: pxor %xmm3, %xmm3 26; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 27; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,1,4,5,6,7] 28; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 29; SSE-NEXT: packuswb %xmm0, %xmm0 30; SSE-NEXT: pand %xmm2, %xmm0 31; SSE-NEXT: pandn %xmm1, %xmm2 32; SSE-NEXT: por %xmm0, %xmm2 33; SSE-NEXT: movd %xmm2, (%rcx) 34; SSE-NEXT: pextrw $2, %xmm2, %eax 35; SSE-NEXT: movw %ax, 4(%rcx) 36; SSE-NEXT: retq 37; 38; AVX-LABEL: store_i8_stride3_vf2: 39; AVX: # %bb.0: 40; AVX-NEXT: vmovdqa (%rdi), %xmm0 41; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 42; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 43; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 44; AVX-NEXT: vpextrw $2, %xmm0, 4(%rcx) 45; AVX-NEXT: vmovd %xmm0, (%rcx) 46; AVX-NEXT: retq 47; 48; AVX2-LABEL: store_i8_stride3_vf2: 49; AVX2: # %bb.0: 50; AVX2-NEXT: vmovdqa (%rdi), %xmm0 51; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 52; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 53; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 54; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rcx) 55; AVX2-NEXT: vmovd %xmm0, (%rcx) 56; AVX2-NEXT: retq 57; 58; AVX2-FP-LABEL: store_i8_stride3_vf2: 59; AVX2-FP: # %bb.0: 60; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 61; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 62; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 63; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 64; AVX2-FP-NEXT: vpextrw $2, %xmm0, 4(%rcx) 65; AVX2-FP-NEXT: vmovd %xmm0, (%rcx) 66; AVX2-FP-NEXT: retq 67; 68; AVX2-FCP-LABEL: store_i8_stride3_vf2: 69; AVX2-FCP: # %bb.0: 70; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 71; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 72; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 73; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 74; AVX2-FCP-NEXT: vpextrw $2, %xmm0, 4(%rcx) 75; AVX2-FCP-NEXT: vmovd %xmm0, (%rcx) 76; AVX2-FCP-NEXT: retq 77; 78; AVX512-LABEL: store_i8_stride3_vf2: 79; AVX512: # %bb.0: 80; AVX512-NEXT: vmovdqa (%rdi), %xmm0 81; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 82; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 83; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 84; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rcx) 85; AVX512-NEXT: vmovd %xmm0, (%rcx) 86; AVX512-NEXT: retq 87; 88; AVX512-FCP-LABEL: store_i8_stride3_vf2: 89; AVX512-FCP: # %bb.0: 90; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 91; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 92; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 93; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 94; AVX512-FCP-NEXT: vpextrw $2, %xmm0, 4(%rcx) 95; AVX512-FCP-NEXT: vmovd %xmm0, (%rcx) 96; AVX512-FCP-NEXT: retq 97; 98; AVX512DQ-LABEL: store_i8_stride3_vf2: 99; AVX512DQ: # %bb.0: 100; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 101; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 102; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 103; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 104; AVX512DQ-NEXT: vpextrw $2, %xmm0, 4(%rcx) 105; AVX512DQ-NEXT: vmovd %xmm0, (%rcx) 106; AVX512DQ-NEXT: retq 107; 108; AVX512DQ-FCP-LABEL: store_i8_stride3_vf2: 109; AVX512DQ-FCP: # %bb.0: 110; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 111; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 112; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 113; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 114; AVX512DQ-FCP-NEXT: vpextrw $2, %xmm0, 4(%rcx) 115; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rcx) 116; AVX512DQ-FCP-NEXT: retq 117; 118; AVX512BW-LABEL: store_i8_stride3_vf2: 119; AVX512BW: # %bb.0: 120; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 121; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 122; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 123; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 124; AVX512BW-NEXT: vpextrw $2, %xmm0, 4(%rcx) 125; AVX512BW-NEXT: vmovd %xmm0, (%rcx) 126; AVX512BW-NEXT: retq 127; 128; AVX512BW-FCP-LABEL: store_i8_stride3_vf2: 129; AVX512BW-FCP: # %bb.0: 130; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 131; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 132; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 133; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 134; AVX512BW-FCP-NEXT: vpextrw $2, %xmm0, 4(%rcx) 135; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rcx) 136; AVX512BW-FCP-NEXT: retq 137; 138; AVX512DQ-BW-LABEL: store_i8_stride3_vf2: 139; AVX512DQ-BW: # %bb.0: 140; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 141; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 142; AVX512DQ-BW-NEXT: 
vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 143; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 144; AVX512DQ-BW-NEXT: vpextrw $2, %xmm0, 4(%rcx) 145; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rcx) 146; AVX512DQ-BW-NEXT: retq 147; 148; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf2: 149; AVX512DQ-BW-FCP: # %bb.0: 150; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 151; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 152; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 153; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,2,6,3,u,u,u,u,u,u,u,u,u,u] 154; AVX512DQ-BW-FCP-NEXT: vpextrw $2, %xmm0, 4(%rcx) 155; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rcx) 156; AVX512DQ-BW-FCP-NEXT: retq 157 %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64 158 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64 159 %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64 160 %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 161 %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 162 %3 = shufflevector <4 x i8> %1, <4 x i8> %2, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> 163 %interleaved.vec = shufflevector <6 x i8> %3, <6 x i8> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> 164 store <6 x i8> %interleaved.vec, ptr %out.vec, align 64 165 ret void 166} 167 168define void @store_i8_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { 169; SSE-LABEL: store_i8_stride3_vf4: 170; SSE: # %bb.0: 171; SSE-NEXT: movdqa (%rdi), %xmm0 172; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 173; SSE-NEXT: pxor %xmm1, %xmm1 174; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 175; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 176; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] 177; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 178; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 179; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] 180; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4] 181; SSE-NEXT: packuswb %xmm1, %xmm0 182; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255] 183; SSE-NEXT: pand %xmm1, %xmm0 184; SSE-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,1,1,4,5,6,7] 185; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] 186; SSE-NEXT: pandn %xmm2, %xmm1 187; SSE-NEXT: por %xmm0, %xmm1 188; SSE-NEXT: movq %xmm1, (%rcx) 189; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 190; SSE-NEXT: movd %xmm0, 8(%rcx) 191; SSE-NEXT: retq 192; 193; AVX-LABEL: store_i8_stride3_vf4: 194; AVX: # %bb.0: 195; AVX-NEXT: vmovdqa (%rdi), %xmm0 196; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 197; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 198; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 199; AVX-NEXT: vpextrd $2, %xmm0, 8(%rcx) 200; AVX-NEXT: vmovq %xmm0, (%rcx) 201; AVX-NEXT: retq 202; 203; AVX2-LABEL: store_i8_stride3_vf4: 204; AVX2: # 
%bb.0: 205; AVX2-NEXT: vmovdqa (%rdi), %xmm0 206; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 207; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 208; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 209; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rcx) 210; AVX2-NEXT: vmovq %xmm0, (%rcx) 211; AVX2-NEXT: retq 212; 213; AVX2-FP-LABEL: store_i8_stride3_vf4: 214; AVX2-FP: # %bb.0: 215; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 216; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 217; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 218; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 219; AVX2-FP-NEXT: vpextrd $2, %xmm0, 8(%rcx) 220; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) 221; AVX2-FP-NEXT: retq 222; 223; AVX2-FCP-LABEL: store_i8_stride3_vf4: 224; AVX2-FCP: # %bb.0: 225; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 226; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 227; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 228; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 229; AVX2-FCP-NEXT: vpextrd $2, %xmm0, 8(%rcx) 230; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) 231; AVX2-FCP-NEXT: retq 232; 233; AVX512-LABEL: store_i8_stride3_vf4: 234; AVX512: # %bb.0: 235; AVX512-NEXT: vmovdqa (%rdi), %xmm0 236; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 237; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 238; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 239; AVX512-NEXT: vpextrd $2, %xmm0, 8(%rcx) 240; AVX512-NEXT: vmovq %xmm0, (%rcx) 241; AVX512-NEXT: retq 242; 243; AVX512-FCP-LABEL: store_i8_stride3_vf4: 244; AVX512-FCP: # %bb.0: 245; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 246; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 247; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 248; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 249; AVX512-FCP-NEXT: vpextrd $2, %xmm0, 8(%rcx) 250; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) 251; AVX512-FCP-NEXT: retq 252; 253; AVX512DQ-LABEL: store_i8_stride3_vf4: 254; AVX512DQ: # %bb.0: 255; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 256; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 257; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 258; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 259; AVX512DQ-NEXT: vpextrd $2, %xmm0, 8(%rcx) 260; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) 261; AVX512DQ-NEXT: retq 262; 263; AVX512DQ-FCP-LABEL: store_i8_stride3_vf4: 264; AVX512DQ-FCP: # %bb.0: 265; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 266; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 267; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 268; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 269; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, 8(%rcx) 270; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) 271; AVX512DQ-FCP-NEXT: retq 272; 273; AVX512BW-LABEL: store_i8_stride3_vf4: 274; AVX512BW: # %bb.0: 275; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 276; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 277; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 278; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 279; AVX512BW-NEXT: vpextrd $2, %xmm0, 8(%rcx) 280; AVX512BW-NEXT: vmovq %xmm0, (%rcx) 281; AVX512BW-NEXT: retq 282; 283; AVX512BW-FCP-LABEL: store_i8_stride3_vf4: 284; AVX512BW-FCP: # %bb.0: 285; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 286; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 287; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 288; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 289; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rcx) 290; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) 291; AVX512BW-FCP-NEXT: retq 292; 293; AVX512DQ-BW-LABEL: store_i8_stride3_vf4: 294; AVX512DQ-BW: # %bb.0: 295; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 296; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 297; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 298; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 299; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 8(%rcx) 300; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) 301; AVX512DQ-BW-NEXT: retq 302; 303; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf4: 304; AVX512DQ-BW-FCP: # %bb.0: 305; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 306; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 307; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] 308; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u] 309; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rcx) 310; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) 311; AVX512DQ-BW-FCP-NEXT: retq 312 %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 313 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 314 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 315 %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 316 %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 317 %3 = shufflevector <8 x i8> %1, <8 x i8> %2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 318 %interleaved.vec = shufflevector <12 x i8> %3, <12 x i8> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 319 store <12 x i8> %interleaved.vec, ptr %out.vec, align 64 320 ret void 321} 322 323define void @store_i8_stride3_vf8(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { 324; SSE-LABEL: store_i8_stride3_vf8: 325; SSE: # %bb.0: 326; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 327; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 328; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 329; SSE-NEXT: pxor %xmm3, %xmm3 330; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 331; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] 332; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] 333; SSE-NEXT: pand %xmm5, %xmm4 334; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 335; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] 336; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 337; SSE-NEXT: pandn %xmm3, %xmm5 338; SSE-NEXT: por %xmm4, %xmm5 339; SSE-NEXT: movdqa %xmm2, %xmm3 340; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 341; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 342; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] 343; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] 344; SSE-NEXT: packuswb %xmm5, %xmm3 345; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] 346; SSE-NEXT: pand %xmm4, %xmm3 347; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] 348; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] 349; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] 350; SSE-NEXT: pandn %xmm5, %xmm4 351; SSE-NEXT: por %xmm3, %xmm4 352; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 353; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] 354; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] 355; SSE-NEXT: packuswb %xmm1, %xmm1 356; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] 357; SSE-NEXT: pand %xmm2, %xmm1 358; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] 359; SSE-NEXT: pandn %xmm0, %xmm2 360; SSE-NEXT: por %xmm1, %xmm2 361; SSE-NEXT: movq %xmm2, 16(%rcx) 362; SSE-NEXT: movdqa %xmm4, (%rcx) 363; SSE-NEXT: retq 364; 365; AVX-LABEL: store_i8_stride3_vf8: 366; AVX: # %bb.0: 367; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 368; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 369; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 370; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 371; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] 372; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero 373; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 374; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] 375; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] 376; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 377; AVX-NEXT: vmovq %xmm0, 16(%rcx) 378; AVX-NEXT: vmovdqa %xmm2, (%rcx) 379; AVX-NEXT: retq 380; 381; AVX2-LABEL: store_i8_stride3_vf8: 382; AVX2: # %bb.0: 383; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 384; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 385; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 386; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 387; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, 
%ymm0 388; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 389; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 390; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 391; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 392; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 393; AVX2-NEXT: vmovq %xmm1, 16(%rcx) 394; AVX2-NEXT: vmovdqa %xmm0, (%rcx) 395; AVX2-NEXT: vzeroupper 396; AVX2-NEXT: retq 397; 398; AVX2-FP-LABEL: store_i8_stride3_vf8: 399; AVX2-FP: # %bb.0: 400; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 401; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 402; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 403; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 404; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 405; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 406; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 407; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 408; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 409; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 410; AVX2-FP-NEXT: vmovq %xmm1, 16(%rcx) 411; AVX2-FP-NEXT: vmovdqa %xmm0, (%rcx) 412; AVX2-FP-NEXT: vzeroupper 413; AVX2-FP-NEXT: retq 414; 415; AVX2-FCP-LABEL: store_i8_stride3_vf8: 416; AVX2-FCP: # %bb.0: 417; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 418; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 419; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 420; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 421; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 422; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 423; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 424; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 425; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 426; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 427; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rcx) 428; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rcx) 429; AVX2-FCP-NEXT: vzeroupper 430; AVX2-FCP-NEXT: retq 431; 432; AVX512-LABEL: store_i8_stride3_vf8: 433; AVX512: # %bb.0: 434; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 435; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 436; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 437; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 438; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 439; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] 440; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 441; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 442; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 443; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 444; AVX512-NEXT: vmovq %xmm1, 16(%rcx) 445; AVX512-NEXT: vmovdqa %xmm0, (%rcx) 446; AVX512-NEXT: vzeroupper 447; AVX512-NEXT: retq 448; 449; AVX512-FCP-LABEL: store_i8_stride3_vf8: 450; AVX512-FCP: # %bb.0: 451; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 452; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 453; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 454; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 455; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 456; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] 457; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 458; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 459; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 460; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 461; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rcx) 462; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rcx) 463; AVX512-FCP-NEXT: vzeroupper 464; AVX512-FCP-NEXT: retq 465; 466; AVX512DQ-LABEL: store_i8_stride3_vf8: 467; AVX512DQ: # %bb.0: 468; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 469; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 470; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 471; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 472; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 473; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] 474; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 475; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 476; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 477; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 478; AVX512DQ-NEXT: vmovq %xmm1, 16(%rcx) 479; AVX512DQ-NEXT: vmovdqa %xmm0, (%rcx) 480; AVX512DQ-NEXT: vzeroupper 481; AVX512DQ-NEXT: retq 482; 483; AVX512DQ-FCP-LABEL: store_i8_stride3_vf8: 484; AVX512DQ-FCP: # %bb.0: 485; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 486; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 487; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 488; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 489; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 490; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] 491; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 492; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 493; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 494; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, 
%xmm1 495; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rcx) 496; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rcx) 497; AVX512DQ-FCP-NEXT: vzeroupper 498; AVX512DQ-FCP-NEXT: retq 499; 500; AVX512BW-LABEL: store_i8_stride3_vf8: 501; AVX512BW: # %bb.0: 502; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 503; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 504; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 505; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 506; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 507; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 508; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 509; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 510; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 511; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 512; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx) 513; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx) 514; AVX512BW-NEXT: vzeroupper 515; AVX512BW-NEXT: retq 516; 517; AVX512BW-FCP-LABEL: store_i8_stride3_vf8: 518; AVX512BW-FCP: # %bb.0: 519; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 520; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 521; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 522; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 523; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 524; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 525; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 526; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 527; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 528; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 529; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rcx) 530; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rcx) 531; AVX512BW-FCP-NEXT: vzeroupper 532; AVX512BW-FCP-NEXT: retq 533; 534; AVX512DQ-BW-LABEL: store_i8_stride3_vf8: 535; AVX512DQ-BW: # %bb.0: 536; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 537; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 538; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 539; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 540; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 541; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 542; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 543; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 544; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 545; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 546; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx) 547; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rcx) 548; AVX512DQ-BW-NEXT: vzeroupper 549; AVX512DQ-BW-NEXT: 
retq 550; 551; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf8: 552; AVX512DQ-BW-FCP: # %bb.0: 553; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 554; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 555; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 556; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 557; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 558; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero 559; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] 560; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero 561; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 562; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 563; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx) 564; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rcx) 565; AVX512DQ-BW-FCP-NEXT: vzeroupper 566; AVX512DQ-BW-FCP-NEXT: retq 567 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 568 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64 569 %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64 570 %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 571 %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 572 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 573 %interleaved.vec = shufflevector <24 x i8> %3, <24 x i8> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 574 store <24 x i8> %interleaved.vec, ptr %out.vec, align 64 575 ret void 576} 577 578define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { 579; SSE-LABEL: store_i8_stride3_vf16: 580; SSE: # %bb.0: 581; SSE-NEXT: movdqa (%rdi), %xmm2 582; SSE-NEXT: movdqa (%rsi), %xmm4 583; SSE-NEXT: movdqa (%rdx), %xmm1 584; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] 585; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] 586; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] 587; SSE-NEXT: movdqa %xmm0, %xmm5 588; SSE-NEXT: pandn %xmm3, %xmm5 589; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] 590; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 591; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 592; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 593; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] 594; SSE-NEXT: pand %xmm0, %xmm6 595; SSE-NEXT: por %xmm5, %xmm6 596; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] 597; SSE-NEXT: pand %xmm5, %xmm6 598; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] 599; SSE-NEXT: pshufhw {{.*#+}} xmm7 = 
xmm3[0,1,2,3,4,4,4,4] 600; SSE-NEXT: movdqa %xmm5, %xmm3 601; SSE-NEXT: pandn %xmm7, %xmm3 602; SSE-NEXT: por %xmm6, %xmm3 603; SSE-NEXT: movdqa %xmm4, %xmm6 604; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 605; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,2] 606; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,2,4,5,6,7] 607; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] 608; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] 609; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,6,6] 610; SSE-NEXT: pand %xmm5, %xmm7 611; SSE-NEXT: pandn %xmm6, %xmm5 612; SSE-NEXT: por %xmm7, %xmm5 613; SSE-NEXT: pand %xmm0, %xmm5 614; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,1] 615; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] 616; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,6,6] 617; SSE-NEXT: movdqa %xmm0, %xmm7 618; SSE-NEXT: pandn %xmm6, %xmm7 619; SSE-NEXT: por %xmm5, %xmm7 620; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 621; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] 622; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,6,7] 623; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 624; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] 625; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 626; SSE-NEXT: pand %xmm0, %xmm2 627; SSE-NEXT: pandn %xmm4, %xmm0 628; SSE-NEXT: por %xmm2, %xmm0 629; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 630; SSE-NEXT: pand %xmm2, %xmm0 631; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 632; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] 633; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] 634; SSE-NEXT: pandn %xmm1, %xmm2 635; SSE-NEXT: por %xmm0, %xmm2 636; SSE-NEXT: movdqa %xmm2, 32(%rcx) 637; SSE-NEXT: movdqa %xmm7, (%rcx) 638; SSE-NEXT: movdqa %xmm3, 16(%rcx) 639; SSE-NEXT: retq 640; 641; AVX-LABEL: store_i8_stride3_vf16: 642; AVX: # %bb.0: 643; AVX-NEXT: vmovdqa (%rdi), %xmm0 644; AVX-NEXT: vmovdqa (%rsi), %xmm1 645; AVX-NEXT: vmovdqa (%rdx), %xmm2 646; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 647; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 648; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 649; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 650; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 651; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 652; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 653; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 654; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 655; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 656; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 657; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 658; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 659; AVX-NEXT: vmovdqa %xmm1, (%rcx) 660; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) 661; AVX-NEXT: retq 662; 663; AVX2-LABEL: store_i8_stride3_vf16: 664; AVX2: # %bb.0: 665; AVX2-NEXT: vmovdqa (%rdi), %xmm0 666; AVX2-NEXT: vmovdqa (%rsi), %xmm1 667; AVX2-NEXT: vmovdqa (%rdx), %xmm2 668; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 669; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = 
xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 670; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 671; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 672; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 673; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 674; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 675; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 676; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 677; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 678; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 679; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 680; AVX2-NEXT: vmovdqa %xmm0, 16(%rcx) 681; AVX2-NEXT: vmovdqa %xmm1, (%rcx) 682; AVX2-NEXT: vmovdqa %xmm2, 32(%rcx) 683; AVX2-NEXT: retq 684; 685; AVX2-FP-LABEL: store_i8_stride3_vf16: 686; AVX2-FP: # %bb.0: 687; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 688; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 689; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 690; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 691; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 692; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 693; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 694; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 695; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 696; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 697; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 698; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 699; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 700; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 701; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 702; AVX2-FP-NEXT: vmovdqa %xmm0, 16(%rcx) 703; AVX2-FP-NEXT: vmovdqa %xmm1, (%rcx) 704; AVX2-FP-NEXT: vmovdqa %xmm2, 32(%rcx) 705; AVX2-FP-NEXT: retq 706; 707; AVX2-FCP-LABEL: store_i8_stride3_vf16: 708; AVX2-FCP: # %bb.0: 709; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 710; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 711; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 712; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 713; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 714; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 715; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 716; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 717; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 718; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 719; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 720; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 721; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 722; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 723; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 724; AVX2-FCP-NEXT: vmovdqa %xmm0, 16(%rcx) 725; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rcx) 726; AVX2-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) 
727; AVX2-FCP-NEXT: retq 728; 729; AVX512-LABEL: store_i8_stride3_vf16: 730; AVX512: # %bb.0: 731; AVX512-NEXT: vmovdqa (%rdi), %xmm0 732; AVX512-NEXT: vmovdqa (%rsi), %xmm1 733; AVX512-NEXT: vmovdqa (%rdx), %xmm2 734; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 735; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 736; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 737; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 738; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 739; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 740; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 741; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 742; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 743; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 744; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 745; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 746; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 747; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx) 748; AVX512-NEXT: vmovdqa %ymm0, (%rcx) 749; AVX512-NEXT: vzeroupper 750; AVX512-NEXT: retq 751; 752; AVX512-FCP-LABEL: store_i8_stride3_vf16: 753; AVX512-FCP: # %bb.0: 754; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 755; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 756; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 757; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 758; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 759; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 760; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 761; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 762; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 763; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 764; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 765; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 766; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 767; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 768; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 769; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 770; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) 771; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx) 772; AVX512-FCP-NEXT: vzeroupper 773; AVX512-FCP-NEXT: retq 774; 775; AVX512DQ-LABEL: store_i8_stride3_vf16: 776; AVX512DQ: # %bb.0: 777; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 778; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 779; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 780; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 781; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 782; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 783; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 784; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 785; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 786; AVX512DQ-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 787; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 788; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 789; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 790; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 791; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 792; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 793; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx) 794; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) 795; AVX512DQ-NEXT: vzeroupper 796; AVX512DQ-NEXT: retq 797; 798; AVX512DQ-FCP-LABEL: store_i8_stride3_vf16: 799; AVX512DQ-FCP: # %bb.0: 800; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 801; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 802; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 803; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 804; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 805; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 806; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 807; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 808; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 809; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 810; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 811; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 812; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 813; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 814; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 815; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 816; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) 817; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx) 818; AVX512DQ-FCP-NEXT: vzeroupper 819; AVX512DQ-FCP-NEXT: retq 820; 821; AVX512BW-LABEL: store_i8_stride3_vf16: 822; AVX512BW: # %bb.0: 823; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 824; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 825; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 826; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 827; AVX512BW-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 828; AVX512BW-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 829; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 830; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 831; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 832; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 833; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 834; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 835; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 836; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 837; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 838; AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 839; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx) 840; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx) 841; AVX512BW-NEXT: vzeroupper 842; AVX512BW-NEXT: retq 843; 844; AVX512BW-FCP-LABEL: store_i8_stride3_vf16: 845; AVX512BW-FCP: # %bb.0: 846; AVX512BW-FCP-NEXT: vmovdqa (%rdi), 
%xmm0 847; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 848; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 849; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 850; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 851; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 852; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 853; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 854; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 855; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 856; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 857; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 858; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 859; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 860; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 861; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 862; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) 863; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx) 864; AVX512BW-FCP-NEXT: vzeroupper 865; AVX512BW-FCP-NEXT: retq 866; 867; AVX512DQ-BW-LABEL: store_i8_stride3_vf16: 868; AVX512DQ-BW: # %bb.0: 869; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 870; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 871; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 872; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 873; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 874; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 875; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 876; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 877; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 878; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 879; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 880; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 881; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 882; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 883; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 884; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 885; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx) 886; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx) 887; AVX512DQ-BW-NEXT: vzeroupper 888; AVX512DQ-BW-NEXT: retq 889; 890; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf16: 891; AVX512DQ-BW-FCP: # %bb.0: 892; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 893; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 894; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 895; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 896; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 897; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 898; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 899; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 900; AVX512DQ-BW-FCP-NEXT: vpalignr 
{{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 901; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 902; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 903; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 904; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 905; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 906; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 907; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 908; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) 909; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx) 910; AVX512DQ-BW-FCP-NEXT: vzeroupper 911; AVX512DQ-BW-FCP-NEXT: retq 912 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 913 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64 914 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64 915 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 916 %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 917 %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 918 %interleaved.vec = shufflevector <48 x i8> %3, <48 x i8> poison, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 919 store <48 x i8> %interleaved.vec, ptr %out.vec, align 64 920 ret void 921} 922 923define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { 924; SSE-LABEL: store_i8_stride3_vf32: 925; SSE: # %bb.0: 926; SSE-NEXT: movdqa (%rdi), %xmm2 927; SSE-NEXT: movdqa 16(%rdi), %xmm8 928; SSE-NEXT: movdqa (%rsi), %xmm4 929; SSE-NEXT: movdqa 16(%rsi), %xmm10 930; SSE-NEXT: movdqa (%rdx), %xmm1 931; SSE-NEXT: movdqa 16(%rdx), %xmm7 932; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] 933; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] 934; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] 935; SSE-NEXT: movdqa %xmm0, %xmm5 936; SSE-NEXT: pandn %xmm3, %xmm5 937; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] 938; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 939; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 940; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 941; SSE-NEXT: pshufhw {{.*#+}} 
xmm6 = xmm3[0,1,2,3,4,5,5,6] 942; SSE-NEXT: pand %xmm0, %xmm6 943; SSE-NEXT: por %xmm5, %xmm6 944; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] 945; SSE-NEXT: pand %xmm5, %xmm6 946; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] 947; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,4,4] 948; SSE-NEXT: movdqa %xmm5, %xmm3 949; SSE-NEXT: pandn %xmm9, %xmm3 950; SSE-NEXT: por %xmm6, %xmm3 951; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] 952; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] 953; SSE-NEXT: movdqa %xmm0, %xmm9 954; SSE-NEXT: pandn %xmm6, %xmm9 955; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] 956; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 957; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 958; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] 959; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,5,5,6] 960; SSE-NEXT: pand %xmm0, %xmm11 961; SSE-NEXT: por %xmm9, %xmm11 962; SSE-NEXT: pand %xmm5, %xmm11 963; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] 964; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,4,4] 965; SSE-NEXT: movdqa %xmm5, %xmm6 966; SSE-NEXT: pandn %xmm9, %xmm6 967; SSE-NEXT: por %xmm11, %xmm6 968; SSE-NEXT: movdqa %xmm10, %xmm9 969; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] 970; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,2,2,3] 971; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7] 972; SSE-NEXT: movdqa %xmm0, %xmm11 973; SSE-NEXT: pandn %xmm9, %xmm11 974; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] 975; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] 976; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,7,7,7,7] 977; SSE-NEXT: pand %xmm0, %xmm12 978; SSE-NEXT: por %xmm11, %xmm12 979; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 980; SSE-NEXT: pand %xmm9, %xmm12 981; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] 982; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,1,2,2,4,5,6,7] 983; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] 984; SSE-NEXT: movdqa %xmm9, %xmm11 985; SSE-NEXT: pandn %xmm13, %xmm11 986; SSE-NEXT: por %xmm12, %xmm11 987; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 988; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,2] 989; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,2,4,5,6,7] 990; SSE-NEXT: movdqa %xmm5, %xmm12 991; SSE-NEXT: pandn %xmm10, %xmm12 992; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] 993; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7] 994; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,6,6] 995; SSE-NEXT: pand %xmm5, %xmm8 996; SSE-NEXT: por %xmm12, %xmm8 997; SSE-NEXT: pand %xmm0, %xmm8 998; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] 999; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] 1000; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,6,6] 1001; SSE-NEXT: movdqa %xmm0, %xmm7 1002; SSE-NEXT: pandn %xmm10, %xmm7 1003; SSE-NEXT: por %xmm8, %xmm7 1004; SSE-NEXT: movdqa %xmm4, %xmm8 1005; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] 1006; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,2,2,3] 1007; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,6,7] 1008; SSE-NEXT: movdqa %xmm0, %xmm10 1009; SSE-NEXT: pandn %xmm8, 
%xmm10 1010; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] 1011; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] 1012; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] 1013; SSE-NEXT: pand %xmm0, %xmm8 1014; SSE-NEXT: por %xmm10, %xmm8 1015; SSE-NEXT: pand %xmm9, %xmm8 1016; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] 1017; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,1,2,2,4,5,6,7] 1018; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] 1019; SSE-NEXT: pandn %xmm10, %xmm9 1020; SSE-NEXT: por %xmm8, %xmm9 1021; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1022; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] 1023; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] 1024; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 1025; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] 1026; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6] 1027; SSE-NEXT: pand %xmm5, %xmm2 1028; SSE-NEXT: pandn %xmm4, %xmm5 1029; SSE-NEXT: por %xmm2, %xmm5 1030; SSE-NEXT: pand %xmm0, %xmm5 1031; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 1032; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1033; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6] 1034; SSE-NEXT: pandn %xmm1, %xmm0 1035; SSE-NEXT: por %xmm5, %xmm0 1036; SSE-NEXT: movdqa %xmm0, (%rcx) 1037; SSE-NEXT: movdqa %xmm9, 32(%rcx) 1038; SSE-NEXT: movdqa %xmm7, 48(%rcx) 1039; SSE-NEXT: movdqa %xmm11, 80(%rcx) 1040; SSE-NEXT: movdqa %xmm6, 16(%rcx) 1041; SSE-NEXT: movdqa %xmm3, 64(%rcx) 1042; SSE-NEXT: retq 1043; 1044; AVX-LABEL: store_i8_stride3_vf32: 1045; AVX: # %bb.0: 1046; AVX-NEXT: vmovdqa (%rdi), %xmm0 1047; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 1048; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1049; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1050; AVX-NEXT: vmovdqa (%rsi), %xmm2 1051; AVX-NEXT: vmovdqa 16(%rsi), %xmm3 1052; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1053; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1054; AVX-NEXT: vmovdqa (%rdx), %xmm6 1055; AVX-NEXT: vmovdqa 16(%rdx), %xmm7 1056; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] 1057; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] 1058; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1059; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1060; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] 1061; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 1062; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 1063; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1064; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 1065; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] 1066; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] 1067; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] 1068; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1069; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 1070; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 1071; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 1072; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3 1073; AVX-NEXT: 
vpshufb %xmm6, %xmm4, %xmm4 1074; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 1075; AVX-NEXT: vmovdqa %xmm1, 64(%rcx) 1076; AVX-NEXT: vmovdqa %xmm4, 80(%rcx) 1077; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) 1078; AVX-NEXT: vmovdqa %xmm5, 32(%rcx) 1079; AVX-NEXT: vmovdqa %xmm2, (%rcx) 1080; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) 1081; AVX-NEXT: retq 1082; 1083; AVX2-LABEL: store_i8_stride3_vf32: 1084; AVX2: # %bb.0: 1085; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1086; AVX2-NEXT: vmovdqa (%rsi), %ymm1 1087; AVX2-NEXT: vmovdqa (%rdx), %ymm2 1088; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1089; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1090; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1091; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1092; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1093; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1094; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1095; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1096; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1097; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1098; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1099; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1100; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1101; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1102; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1103; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1104; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) 1105; AVX2-NEXT: vmovdqa %ymm0, 64(%rcx) 1106; AVX2-NEXT: vmovdqa %ymm3, (%rcx) 1107; AVX2-NEXT: vzeroupper 1108; AVX2-NEXT: retq 1109; 1110; AVX2-FP-LABEL: store_i8_stride3_vf32: 1111; AVX2-FP: # %bb.0: 1112; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1113; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 1114; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 1115; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1116; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1117; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1118; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1119; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1120; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1121; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1122; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1123; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1124; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1125; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] 1126; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1127; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1128; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1129; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1130; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1131; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rcx) 1132; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rcx) 1133; AVX2-FP-NEXT: vmovdqa %ymm3, (%rcx) 1134; AVX2-FP-NEXT: vzeroupper 1135; AVX2-FP-NEXT: retq 1136; 1137; AVX2-FCP-LABEL: store_i8_stride3_vf32: 1138; AVX2-FCP: # %bb.0: 1139; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1140; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 1141; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm2 1142; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1143; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1144; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1145; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1146; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1147; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1148; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1149; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1150; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1151; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1152; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1153; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1154; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1155; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1156; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1157; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1158; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rcx) 1159; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 1160; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rcx) 1161; AVX2-FCP-NEXT: vzeroupper 1162; AVX2-FCP-NEXT: retq 1163; 1164; AVX512-LABEL: store_i8_stride3_vf32: 1165; AVX512: # %bb.0: 1166; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1167; AVX512-NEXT: vmovdqa (%rsi), %ymm1 1168; AVX512-NEXT: vmovdqa (%rdx), %ymm2 1169; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1170; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1171; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1172; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1173; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1174; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1175; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1176; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1177; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1178; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1179; AVX512-NEXT: # ymm4 = mem[0,1,0,1] 1180; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1181; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1182; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1183; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1184; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1185; AVX512-NEXT: vmovdqa %ymm1, 32(%rcx) 1186; AVX512-NEXT: vmovdqa %ymm0, 64(%rcx) 1187; AVX512-NEXT: vmovdqa %ymm3, (%rcx) 1188; AVX512-NEXT: vzeroupper 1189; AVX512-NEXT: retq 1190; 1191; AVX512-FCP-LABEL: store_i8_stride3_vf32: 1192; AVX512-FCP: # %bb.0: 1193; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1194; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 1195; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 1196; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1197; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1198; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1199; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1200; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1201; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1202; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1203; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1204; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1205; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1206; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1207; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1208; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1209; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, 
%ymm1 1210; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1211; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1212; AVX512-FCP-NEXT: vmovdqa %ymm1, 32(%rcx) 1213; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 1214; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rcx) 1215; AVX512-FCP-NEXT: vzeroupper 1216; AVX512-FCP-NEXT: retq 1217; 1218; AVX512DQ-LABEL: store_i8_stride3_vf32: 1219; AVX512DQ: # %bb.0: 1220; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1221; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 1222; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 1223; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1224; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1225; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1226; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1227; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1228; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1229; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1230; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1231; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1232; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1233; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] 1234; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1235; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1236; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1237; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1238; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1239; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 1240; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rcx) 1241; AVX512DQ-NEXT: vmovdqa %ymm3, (%rcx) 1242; AVX512DQ-NEXT: vzeroupper 1243; AVX512DQ-NEXT: retq 1244; 1245; AVX512DQ-FCP-LABEL: store_i8_stride3_vf32: 1246; AVX512DQ-FCP: # %bb.0: 1247; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1248; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 1249; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 1250; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1251; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1252; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1253; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1254; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1255; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1256; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1257; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1258; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1259; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1260; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 1261; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1262; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1263; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 1264; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1265; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1266; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 32(%rcx) 1267; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 1268; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rcx) 1269; AVX512DQ-FCP-NEXT: vzeroupper 1270; AVX512DQ-FCP-NEXT: retq 1271; 1272; AVX512BW-LABEL: store_i8_stride3_vf32: 1273; AVX512BW: # %bb.0: 1274; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1275; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1 1276; AVX512BW-NEXT: vmovdqa (%rdx), %ymm2 1277; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1278; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1279; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1280; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1281; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1282; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1283; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1284; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1285; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1286; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1287; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1288; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1289; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1290; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1291; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1292; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 1293; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rcx) 1294; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx) 1295; AVX512BW-NEXT: vzeroupper 1296; AVX512BW-NEXT: retq 1297; 1298; AVX512BW-FCP-LABEL: store_i8_stride3_vf32: 1299; AVX512BW-FCP: # %bb.0: 1300; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1301; AVX512BW-FCP-NEXT: vmovdqa 
(%rsi), %ymm1 1302; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm2 1303; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1304; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1305; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1306; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1307; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1308; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1309; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1310; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1311; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1312; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1313; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1314; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1315; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1316; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1317; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1318; AVX512BW-FCP-NEXT: vpshufb %zmm2, %zmm1, %zmm1 1319; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 1320; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) 1321; AVX512BW-FCP-NEXT: vzeroupper 1322; AVX512BW-FCP-NEXT: retq 1323; 1324; AVX512DQ-BW-LABEL: store_i8_stride3_vf32: 1325; AVX512DQ-BW: # %bb.0: 1326; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 1327; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm1 1328; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm2 1329; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1330; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1331; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1332; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1333; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1334; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1335; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1336; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm2 = 
ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1337; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1338; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1339; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1340; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1341; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1342; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1343; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1344; AVX512DQ-BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 1345; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rcx) 1346; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rcx) 1347; AVX512DQ-BW-NEXT: vzeroupper 1348; AVX512DQ-BW-NEXT: retq 1349; 1350; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf32: 1351; AVX512DQ-BW-FCP: # %bb.0: 1352; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1353; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm1 1354; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm2 1355; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1356; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1357; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1358; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1359; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1360; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1361; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1362; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1363; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 1364; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1365; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1366; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1367; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1368; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1369; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1370; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm2, %zmm1, %zmm1 1371; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 1372; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) 1373; AVX512DQ-BW-FCP-NEXT: vzeroupper 1374; AVX512DQ-BW-FCP-NEXT: retq 1375 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 1376 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64 1377 %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64 1378 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 
1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1379 %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1380 %3 = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95> 1381 %interleaved.vec = shufflevector <96 x i8> %3, <96 x i8> poison, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95> 1382 store <96 x i8> %interleaved.vec, ptr %out.vec, align 64 1383 ret void 1384} 1385 1386define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { 1387; SSE-LABEL: store_i8_stride3_vf64: 1388; SSE: # %bb.0: 1389; SSE-NEXT: movdqa 16(%rdi), %xmm10 1390; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1391; SSE-NEXT: movdqa 32(%rdi), %xmm12 1392; SSE-NEXT: movdqa 48(%rdi), %xmm5 1393; SSE-NEXT: movdqa 16(%rsi), %xmm9 1394; SSE-NEXT: movdqa 32(%rsi), %xmm14 1395; SSE-NEXT: movdqa 48(%rsi), %xmm4 1396; SSE-NEXT: movdqa 16(%rdx), %xmm8 1397; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1398; SSE-NEXT: movdqa 32(%rdx), %xmm11 1399; 
SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1400; SSE-NEXT: movdqa 48(%rdx), %xmm7 1401; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] 1402; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] 1403; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] 1404; SSE-NEXT: movdqa %xmm1, %xmm3 1405; SSE-NEXT: pandn %xmm2, %xmm3 1406; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] 1407; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1408; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1409; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 1410; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,6] 1411; SSE-NEXT: pand %xmm1, %xmm6 1412; SSE-NEXT: por %xmm3, %xmm6 1413; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] 1414; SSE-NEXT: pand %xmm2, %xmm6 1415; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] 1416; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] 1417; SSE-NEXT: movdqa %xmm2, %xmm0 1418; SSE-NEXT: pandn %xmm3, %xmm0 1419; SSE-NEXT: por %xmm6, %xmm0 1420; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1421; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] 1422; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] 1423; SSE-NEXT: movdqa %xmm1, %xmm6 1424; SSE-NEXT: pandn %xmm3, %xmm6 1425; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,2,3] 1426; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1427; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 1428; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 1429; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] 1430; SSE-NEXT: pand %xmm1, %xmm3 1431; SSE-NEXT: por %xmm6, %xmm3 1432; SSE-NEXT: pand %xmm2, %xmm3 1433; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] 1434; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] 1435; SSE-NEXT: movdqa %xmm2, %xmm0 1436; SSE-NEXT: pandn %xmm6, %xmm0 1437; SSE-NEXT: por %xmm3, %xmm0 1438; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1439; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] 1440; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] 1441; SSE-NEXT: movdqa %xmm1, %xmm6 1442; SSE-NEXT: pandn %xmm3, %xmm6 1443; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] 1444; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1445; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 1446; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 1447; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] 1448; SSE-NEXT: pand %xmm1, %xmm3 1449; SSE-NEXT: por %xmm6, %xmm3 1450; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] 1451; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,4,4,4] 1452; SSE-NEXT: movdqa %xmm2, %xmm13 1453; SSE-NEXT: pandn %xmm8, %xmm13 1454; SSE-NEXT: movdqa (%rdi), %xmm0 1455; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1456; SSE-NEXT: pand %xmm2, %xmm3 1457; SSE-NEXT: por %xmm3, %xmm13 1458; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] 1459; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] 1460; SSE-NEXT: movdqa %xmm1, %xmm8 1461; SSE-NEXT: pandn %xmm3, %xmm8 1462; SSE-NEXT: movdqa (%rsi), %xmm11 1463; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] 1464; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1465; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 1466; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 1467; SSE-NEXT: 
pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] 1468; SSE-NEXT: pand %xmm1, %xmm3 1469; SSE-NEXT: por %xmm8, %xmm3 1470; SSE-NEXT: movdqa (%rdx), %xmm0 1471; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1472; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] 1473; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,4,4,4] 1474; SSE-NEXT: movdqa %xmm2, %xmm10 1475; SSE-NEXT: pandn %xmm15, %xmm10 1476; SSE-NEXT: pand %xmm2, %xmm3 1477; SSE-NEXT: por %xmm3, %xmm10 1478; SSE-NEXT: movdqa %xmm4, %xmm3 1479; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 1480; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 1481; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] 1482; SSE-NEXT: movdqa %xmm1, %xmm15 1483; SSE-NEXT: pandn %xmm3, %xmm15 1484; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 1485; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] 1486; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] 1487; SSE-NEXT: pand %xmm1, %xmm0 1488; SSE-NEXT: por %xmm15, %xmm0 1489; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] 1490; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] 1491; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] 1492; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] 1493; SSE-NEXT: movdqa %xmm15, %xmm8 1494; SSE-NEXT: pandn %xmm3, %xmm8 1495; SSE-NEXT: pand %xmm15, %xmm0 1496; SSE-NEXT: por %xmm0, %xmm8 1497; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1498; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,1,2] 1499; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 1500; SSE-NEXT: movdqa %xmm2, %xmm3 1501; SSE-NEXT: pandn %xmm0, %xmm3 1502; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] 1503; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 1504; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,5,6,6] 1505; SSE-NEXT: pand %xmm2, %xmm4 1506; SSE-NEXT: por %xmm3, %xmm4 1507; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] 1508; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1509; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,5,6,6] 1510; SSE-NEXT: movdqa %xmm1, %xmm6 1511; SSE-NEXT: pandn %xmm3, %xmm6 1512; SSE-NEXT: pand %xmm1, %xmm4 1513; SSE-NEXT: por %xmm4, %xmm6 1514; SSE-NEXT: movdqa %xmm14, %xmm3 1515; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] 1516; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 1517; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] 1518; SSE-NEXT: movdqa %xmm1, %xmm4 1519; SSE-NEXT: pandn %xmm3, %xmm4 1520; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] 1521; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] 1522; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] 1523; SSE-NEXT: pand %xmm1, %xmm3 1524; SSE-NEXT: por %xmm4, %xmm3 1525; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1526; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] 1527; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] 1528; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] 1529; SSE-NEXT: movdqa %xmm15, %xmm4 1530; SSE-NEXT: pandn %xmm5, %xmm4 1531; SSE-NEXT: pand %xmm15, %xmm3 1532; SSE-NEXT: por %xmm3, %xmm4 1533; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1534; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm14[0,1,1,2] 1535; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] 1536; SSE-NEXT: movdqa %xmm2, %xmm5 1537; SSE-NEXT: pandn %xmm3, %xmm5 1538; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] 1539; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] 1540; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] 1541; SSE-NEXT: pand %xmm2, %xmm3 1542; SSE-NEXT: por %xmm5, %xmm3 1543; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] 1544; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] 1545; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,6,6] 1546; SSE-NEXT: movdqa %xmm1, %xmm5 1547; SSE-NEXT: pandn %xmm7, %xmm5 1548; SSE-NEXT: pand %xmm1, %xmm3 1549; SSE-NEXT: por %xmm3, %xmm5 1550; SSE-NEXT: movdqa %xmm9, %xmm3 1551; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 1552; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 1553; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] 1554; SSE-NEXT: movdqa %xmm1, %xmm7 1555; SSE-NEXT: pandn %xmm3, %xmm7 1556; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1557; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1558; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] 1559; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] 1560; SSE-NEXT: pand %xmm1, %xmm3 1561; SSE-NEXT: por %xmm7, %xmm3 1562; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 1563; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] 1564; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] 1565; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] 1566; SSE-NEXT: movdqa %xmm15, %xmm12 1567; SSE-NEXT: pandn %xmm7, %xmm12 1568; SSE-NEXT: pand %xmm15, %xmm3 1569; SSE-NEXT: por %xmm3, %xmm12 1570; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1571; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,2] 1572; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] 1573; SSE-NEXT: movdqa %xmm2, %xmm7 1574; SSE-NEXT: pandn %xmm3, %xmm7 1575; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] 1576; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] 1577; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] 1578; SSE-NEXT: pand %xmm2, %xmm3 1579; SSE-NEXT: por %xmm7, %xmm3 1580; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,1] 1581; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] 1582; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] 1583; SSE-NEXT: movdqa %xmm1, %xmm7 1584; SSE-NEXT: pandn %xmm9, %xmm7 1585; SSE-NEXT: pand %xmm1, %xmm3 1586; SSE-NEXT: por %xmm3, %xmm7 1587; SSE-NEXT: movdqa %xmm11, %xmm3 1588; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] 1589; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] 1590; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] 1591; SSE-NEXT: movdqa %xmm1, %xmm9 1592; SSE-NEXT: pandn %xmm3, %xmm9 1593; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1594; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 1595; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] 1596; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] 1597; SSE-NEXT: pand %xmm1, %xmm3 1598; SSE-NEXT: por %xmm9, %xmm3 1599; SSE-NEXT: pand %xmm15, %xmm3 1600; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 1601; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] 1602; SSE-NEXT: 
pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] 1603; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] 1604; SSE-NEXT: pandn %xmm9, %xmm15 1605; SSE-NEXT: por %xmm3, %xmm15 1606; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1607; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,1,2] 1608; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] 1609; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] 1610; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] 1611; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,6,6] 1612; SSE-NEXT: pand %xmm2, %xmm9 1613; SSE-NEXT: pandn %xmm3, %xmm2 1614; SSE-NEXT: por %xmm9, %xmm2 1615; SSE-NEXT: pand %xmm1, %xmm2 1616; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,1] 1617; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] 1618; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] 1619; SSE-NEXT: pandn %xmm3, %xmm1 1620; SSE-NEXT: por %xmm2, %xmm1 1621; SSE-NEXT: movdqa %xmm1, (%rcx) 1622; SSE-NEXT: movdqa %xmm15, 32(%rcx) 1623; SSE-NEXT: movdqa %xmm7, 48(%rcx) 1624; SSE-NEXT: movdqa %xmm12, 80(%rcx) 1625; SSE-NEXT: movdqa %xmm5, 96(%rcx) 1626; SSE-NEXT: movdqa %xmm4, 128(%rcx) 1627; SSE-NEXT: movdqa %xmm6, 144(%rcx) 1628; SSE-NEXT: movdqa %xmm8, 176(%rcx) 1629; SSE-NEXT: movdqa %xmm10, 16(%rcx) 1630; SSE-NEXT: movdqa %xmm13, 64(%rcx) 1631; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1632; SSE-NEXT: movaps %xmm0, 112(%rcx) 1633; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1634; SSE-NEXT: movaps %xmm0, 160(%rcx) 1635; SSE-NEXT: retq 1636; 1637; AVX-LABEL: store_i8_stride3_vf64: 1638; AVX: # %bb.0: 1639; AVX-NEXT: subq $24, %rsp 1640; AVX-NEXT: vmovdqa (%rdi), %xmm7 1641; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 1642; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 1643; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 1644; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] 1645; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm0 1646; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1647; AVX-NEXT: vmovdqa (%rdx), %xmm3 1648; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 1649; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm0 1650; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1651; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm0 1652; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1653; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 1654; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10] 1655; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2 1656; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] 1657; AVX-NEXT: vmovdqa 16(%rsi), %xmm12 1658; AVX-NEXT: vmovdqa 32(%rsi), %xmm13 1659; AVX-NEXT: vmovdqa 48(%rsi), %xmm14 1660; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm15 1661; AVX-NEXT: vpor %xmm2, %xmm15, %xmm0 1662; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1663; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm6 1664; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm15 1665; AVX-NEXT: vpor %xmm6, %xmm15, %xmm0 1666; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1667; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9 1668; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm15 1669; AVX-NEXT: vpor %xmm9, %xmm15, %xmm0 1670; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1671; AVX-NEXT: vmovdqa (%rsi), %xmm15 1672; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7 1673; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm10 1674; AVX-NEXT: vpor %xmm7, %xmm10, %xmm0 1675; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1676; AVX-NEXT: 
vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] 1677; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] 1678; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm6 1679; AVX-NEXT: vmovdqa %xmm1, %xmm0 1680; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] 1681; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm5 1682; AVX-NEXT: vmovdqa 32(%rdx), %xmm11 1683; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] 1684; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm4 1685; AVX-NEXT: vmovdqa 48(%rdx), %xmm7 1686; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] 1687; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9 1688; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] 1689; AVX-NEXT: vpor %xmm10, %xmm8, %xmm10 1690; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] 1691; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1692; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4] 1693; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1694; AVX-NEXT: vpor %xmm2, %xmm14, %xmm14 1695; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] 1696; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] 1697; AVX-NEXT: vmovdqa %xmm0, %xmm10 1698; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1699; AVX-NEXT: vpor %xmm1, %xmm14, %xmm14 1700; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] 1701; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] 1702; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1703; AVX-NEXT: vpor %xmm0, %xmm14, %xmm14 1704; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] 1705; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] 1706; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1707; AVX-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1708; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1709; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1710; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] 1711; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1712; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 1713; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] 1714; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm1 1715; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 1716; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1717; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 1718; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm3 1719; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 1720; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload 1721; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 1722; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm11 1723; AVX-NEXT: vpor %xmm6, %xmm11, %xmm6 1724; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1725; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm14 1726; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4 1727; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12 1728; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 1729; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm13 1730; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm0 1731; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1732; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm10 1733; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1734; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm8 1735; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload 1736; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm5 1737; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7 1738; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 1739; AVX-NEXT: vmovdqa %xmm3, 64(%rcx) 1740; AVX-NEXT: vmovdqa %xmm2, 80(%rcx) 1741; AVX-NEXT: vmovdqa %xmm14, (%rcx) 1742; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) 1743; AVX-NEXT: vmovdqa %xmm4, 32(%rcx) 1744; AVX-NEXT: vmovdqa %xmm12, 48(%rcx) 1745; AVX-NEXT: vmovdqa %xmm5, 160(%rcx) 1746; AVX-NEXT: vmovdqa %xmm8, 176(%rcx) 1747; AVX-NEXT: vmovdqa %xmm13, 96(%rcx) 1748; AVX-NEXT: vmovdqa %xmm6, 112(%rcx) 1749; AVX-NEXT: vmovdqa %xmm0, 128(%rcx) 1750; AVX-NEXT: vmovdqa %xmm10, 144(%rcx) 1751; AVX-NEXT: addq $24, %rsp 1752; AVX-NEXT: retq 1753; 1754; AVX2-LABEL: store_i8_stride3_vf64: 1755; AVX2: # %bb.0: 1756; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1757; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1758; AVX2-NEXT: vmovdqa (%rsi), %ymm2 1759; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 1760; AVX2-NEXT: vmovdqa (%rdx), %ymm4 1761; AVX2-NEXT: vmovdqa 32(%rdx), %ymm5 1762; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1763; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] 1764; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1765; AVX2-NEXT: # ymm8 = mem[0,1,0,1] 1766; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 1767; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1768; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] 1769; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 1770; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] 1771; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] 1772; AVX2-NEXT: # ymm11 = mem[0,1,0,1] 1773; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm0, %ymm0 1774; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26] 1775; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 1776; AVX2-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm4[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1777; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = 
[0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255] 1778; AVX2-NEXT: # ymm12 = mem[0,1,0,1] 1779; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm11 1780; AVX2-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1781; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 1782; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1783; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1784; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1785; AVX2-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] 1786; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 1787; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1788; AVX2-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] 1789; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 1790; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 1791; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 1792; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 1793; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1794; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 1795; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6 1796; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 1797; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 1798; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 1799; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 1800; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 1801; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5 1802; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 1803; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3 1804; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 1805; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1806; AVX2-NEXT: vmovdqa %ymm2, 32(%rcx) 1807; AVX2-NEXT: vmovdqa %ymm3, 128(%rcx) 1808; AVX2-NEXT: vmovdqa %ymm0, 64(%rcx) 1809; AVX2-NEXT: vmovdqa %ymm1, 160(%rcx) 1810; AVX2-NEXT: vmovdqa %ymm6, (%rcx) 1811; AVX2-NEXT: vmovdqa %ymm5, 96(%rcx) 1812; AVX2-NEXT: vzeroupper 1813; AVX2-NEXT: retq 1814; 1815; AVX2-FP-LABEL: store_i8_stride3_vf64: 1816; AVX2-FP: # %bb.0: 1817; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1818; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 1819; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 1820; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3 1821; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm4 1822; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm5 1823; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1824; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] 1825; 
AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1826; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] 1827; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 1828; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1829; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] 1830; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 1831; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] 1832; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] 1833; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] 1834; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm0, %ymm0 1835; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26] 1836; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 1837; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm4[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1838; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255] 1839; AVX2-FP-NEXT: # ymm12 = mem[0,1,0,1] 1840; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm11 1841; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1842; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 1843; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1844; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1845; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1846; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] 1847; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 1848; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1849; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] 1850; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 1851; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 1852; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 1853; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 1854; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1855; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] 1856; 
AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 1857; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 1858; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 1859; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 1860; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 1861; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 1862; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 1863; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 1864; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 1865; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 1866; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1867; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rcx) 1868; AVX2-FP-NEXT: vmovdqa %ymm3, 128(%rcx) 1869; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rcx) 1870; AVX2-FP-NEXT: vmovdqa %ymm1, 160(%rcx) 1871; AVX2-FP-NEXT: vmovdqa %ymm6, (%rcx) 1872; AVX2-FP-NEXT: vmovdqa %ymm5, 96(%rcx) 1873; AVX2-FP-NEXT: vzeroupper 1874; AVX2-FP-NEXT: retq 1875; 1876; AVX2-FCP-LABEL: store_i8_stride3_vf64: 1877; AVX2-FCP: # %bb.0: 1878; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1879; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1880; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 1881; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 1882; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4 1883; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 1884; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1885; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] 1886; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1887; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] 1888; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 1889; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1890; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] 1891; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 1892; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] 1893; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] 1894; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] 1895; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm0, %ymm0 1896; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26] 1897; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 1898; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm4[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1899; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255] 1900; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1] 1901; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm11 1902; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1903; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 1904; 
AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1905; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1906; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1907; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] 1908; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 1909; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero 1910; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] 1911; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 1912; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 1913; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 1914; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 1915; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1916; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] 1917; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 1918; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 1919; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 1920; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 1921; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 1922; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 1923; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 1924; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 1925; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 1926; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 1927; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1928; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rcx) 1929; AVX2-FCP-NEXT: vmovdqa %ymm3, 128(%rcx) 1930; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 1931; AVX2-FCP-NEXT: vmovdqa %ymm1, 160(%rcx) 1932; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rcx) 1933; AVX2-FCP-NEXT: vmovdqa %ymm5, 96(%rcx) 1934; AVX2-FCP-NEXT: vzeroupper 1935; AVX2-FCP-NEXT: retq 1936; 1937; AVX512-LABEL: store_i8_stride3_vf64: 1938; AVX512: # %bb.0: 1939; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1940; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 1941; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1942; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1943; AVX512-NEXT: vmovdqa (%rsi), %ymm2 1944; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 1945; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1946; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1947; AVX512-NEXT: vmovdqa (%rdx), %ymm6 1948; AVX512-NEXT: vmovdqa 32(%rdx), %ymm7 1949; AVX512-NEXT: vpalignr {{.*#+}} ymm8 = 
ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 1950; AVX512-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] 1951; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1952; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1953; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 1954; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1955; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1956; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1957; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1958; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 1959; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 1960; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] 1961; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 1962; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1963; AVX512-NEXT: # ymm7 = mem[0,1,0,1] 1964; AVX512-NEXT: vpshufb %ymm7, %ymm6, %ymm6 1965; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 1966; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm2 1967; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 1968; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0 1969; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 1970; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 1971; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 1972; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3 1973; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 1974; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1975; AVX512-NEXT: vmovdqa %ymm3, 128(%rcx) 1976; AVX512-NEXT: vmovdqa %ymm1, 160(%rcx) 1977; AVX512-NEXT: vmovdqa %ymm0, 64(%rcx) 1978; AVX512-NEXT: vmovdqa %ymm2, 32(%rcx) 1979; AVX512-NEXT: vmovdqa %ymm5, 96(%rcx) 1980; AVX512-NEXT: vmovdqa %ymm6, (%rcx) 1981; AVX512-NEXT: vzeroupper 1982; AVX512-NEXT: retq 1983; 1984; AVX512-FCP-LABEL: store_i8_stride3_vf64: 1985; AVX512-FCP: # %bb.0: 1986; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1987; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1988; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1989; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1990; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 1991; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 1992; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1993; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1994; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6 1995; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 1996; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 1997; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] 1998; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1999; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 2000; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 2001; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 2002; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 2003; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 2004; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 2005; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 2006; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 2007; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] 2008; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 2009; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2010; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] 2011; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 2012; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 2013; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 2014; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 2015; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 2016; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 2017; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 2018; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 2019; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 2020; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 2021; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 2022; AVX512-FCP-NEXT: vmovdqa %ymm3, 128(%rcx) 2023; AVX512-FCP-NEXT: vmovdqa %ymm1, 160(%rcx) 2024; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 2025; AVX512-FCP-NEXT: vmovdqa %ymm2, 32(%rcx) 2026; AVX512-FCP-NEXT: vmovdqa %ymm5, 96(%rcx) 2027; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx) 2028; AVX512-FCP-NEXT: vzeroupper 2029; AVX512-FCP-NEXT: retq 2030; 2031; AVX512DQ-LABEL: store_i8_stride3_vf64: 2032; AVX512DQ: # %bb.0: 2033; AVX512DQ-NEXT: 
vmovdqa (%rdi), %ymm0 2034; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 2035; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 2036; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 2037; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2 2038; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 2039; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 2040; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 2041; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm6 2042; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm7 2043; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 2044; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] 2045; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 2046; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 2047; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 2048; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 2049; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 2050; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 2051; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 2052; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 2053; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 2054; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] 2055; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 2056; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2057; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] 2058; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm6 2059; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 2060; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm2 2061; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 2062; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0 2063; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 2064; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 2065; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 2066; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm3 2067; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 2068; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 2069; AVX512DQ-NEXT: 
vmovdqa %ymm3, 128(%rcx) 2070; AVX512DQ-NEXT: vmovdqa %ymm1, 160(%rcx) 2071; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rcx) 2072; AVX512DQ-NEXT: vmovdqa %ymm2, 32(%rcx) 2073; AVX512DQ-NEXT: vmovdqa %ymm5, 96(%rcx) 2074; AVX512DQ-NEXT: vmovdqa %ymm6, (%rcx) 2075; AVX512DQ-NEXT: vzeroupper 2076; AVX512DQ-NEXT: retq 2077; 2078; AVX512DQ-FCP-LABEL: store_i8_stride3_vf64: 2079; AVX512DQ-FCP: # %bb.0: 2080; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 2081; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 2082; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 2083; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 2084; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 2085; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 2086; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 2087; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 2088; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm6 2089; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 2090; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 2091; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] 2092; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 2093; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 2094; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 2095; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 2096; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 2097; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 2098; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 2099; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 2100; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] 2101; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] 2102; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 2103; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2104; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] 2105; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 2106; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] 2107; 
AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 2108; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] 2109; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 2110; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm5 2111; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 2112; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 2113; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 2114; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] 2115; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 2116; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 128(%rcx) 2117; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 160(%rcx) 2118; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) 2119; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 32(%rcx) 2120; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, 96(%rcx) 2121; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) 2122; AVX512DQ-FCP-NEXT: vzeroupper 2123; AVX512DQ-FCP-NEXT: retq 2124; 2125; AVX512BW-LABEL: store_i8_stride3_vf64: 2126; AVX512BW: # %bb.0: 2127; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2128; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 2129; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 2130; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53] 2131; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58] 2132; AVX512BW-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2133; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52] 2134; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52] 2135; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52] 2136; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2137; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52] 2138; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 2139; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2140; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3] 2141; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2142; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 2143; 
AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6 2144; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2 2145; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2146; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 2147; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 2148; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2149; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2150; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 2151; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 2152; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm4 2153; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2154; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 2155; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) 2156; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rcx) 2157; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rcx) 2158; AVX512BW-NEXT: vzeroupper 2159; AVX512BW-NEXT: retq 2160; 2161; AVX512BW-FCP-LABEL: store_i8_stride3_vf64: 2162; AVX512BW-FCP: # %bb.0: 2163; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 2164; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 2165; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 2166; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53] 2167; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58] 2168; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2169; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52] 2170; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52] 2171; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52] 2172; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2173; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52] 2174; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 2175; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] 2176; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3] 2177; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2178; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm1 2179; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6 2180; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm2 2181; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2182; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 2183; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 2184; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2185; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2186; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2 2187; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 2188; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm4, %zmm4 2189; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2190; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm0, %zmm0 2191; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) 2192; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rcx) 2193; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 2194; AVX512BW-FCP-NEXT: vzeroupper 2195; AVX512BW-FCP-NEXT: retq 2196; 2197; AVX512DQ-BW-LABEL: store_i8_stride3_vf64: 2198; AVX512DQ-BW: # %bb.0: 2199; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 2200; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 2201; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 2202; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53] 2203; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58] 2204; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2205; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52] 2206; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52] 2207; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52] 2208; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2209; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm2 = 
zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52] 2210; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 2211; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2212; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3] 2213; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2214; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 2215; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6 2216; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2 2217; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2218; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 2219; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 2220; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2221; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2222; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 2223; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 2224; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm4, %zmm4 2225; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2226; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 2227; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) 2228; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rcx) 2229; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rcx) 2230; AVX512DQ-BW-NEXT: vzeroupper 2231; AVX512DQ-BW-NEXT: retq 2232; 2233; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf64: 2234; AVX512DQ-BW-FCP: # %bb.0: 2235; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 2236; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 2237; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 2238; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53] 2239; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58] 2240; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2241; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52] 2242; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52] 2243; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52] 2244; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 
= zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 2245; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52] 2246; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 2247; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2248; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3] 2249; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2250; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm1 2251; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6 2252; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm2 2253; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2254; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 2255; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 2256; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 2257; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 2258; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2 2259; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 2260; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm4, %zmm4 2261; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 2262; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm0, %zmm0 2263; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) 2264; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rcx) 2265; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 2266; AVX512DQ-BW-FCP-NEXT: vzeroupper 2267; AVX512DQ-BW-FCP-NEXT: retq 2268 %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 2269 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 2270 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 2271 %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 2272 %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 
22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2273 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191> 2274 %interleaved.vec = shufflevector <192 x i8> %3, <192 x i8> poison, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, 
i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191> 2275 store <192 x i8> %interleaved.vec, ptr %out.vec, align 64 2276 ret void 2277} 2278
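; The <192 x i8> shuffle mask above (0, 64, 128, 1, 65, 129, ...) places element i of
; the first source at 3*i, the second at 3*i+1, and the third at 3*i+2. A minimal
; scalar sketch of that stride-3 interleaved store is shown below; the names (src0,
; src1, src2, dst) are hypothetical and the snippet is illustration only, not part of
; the test input:
;
;   #include <stdint.h>
;
;   void store_i8_stride3_vf64(const uint8_t *src0, const uint8_t *src1,
;                              const uint8_t *src2, uint8_t *dst) {
;     for (int i = 0; i < 64; ++i) {
;       dst[3 * i + 0] = src0[i];   /* element i of the first 64 x i8 vector  */
;       dst[3 * i + 1] = src1[i];   /* element i of the second 64 x i8 vector */
;       dst[3 * i + 2] = src2[i];   /* element i of the third 64 x i8 vector  */
;     }
;   }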