1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP 15 16; These patterns are produced by LoopVectorizer for interleaved stores. 
; store_i8_stride4_vf2 - stride-4 interleaved store with two i8 elements per input vector.
; Loads one <2 x i8> vector from each of %in.vecptr0, %in.vecptr1, %in.vecptr2 and %in.vecptr3,
; concatenates them into a single <8 x i8> (the shufflevectors %1, %2 and %3 all use identity masks),
; then reorders that vector with the mask <0,2,4,6,1,3,5,7>. The <8 x i8> value stored to %out.vec is
; therefore in0[0], in1[0], in2[0], in3[0], in0[1], in1[1], in2[1], in3[1].
; The per-configuration assertion comments that follow are autogenerated (see the NOTE at the top of
; this file); regenerate them with that script instead of editing them by hand.
17 18define void @store_i8_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { 19; SSE-LABEL: store_i8_stride4_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movdqa (%rdi), %xmm0 22; SSE-NEXT: movdqa (%rdx), %xmm1 23; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 24; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 25; SSE-NEXT: pxor %xmm2, %xmm2 26; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 27; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] 28; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] 29; SSE-NEXT: packuswb %xmm1, %xmm1 30; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] 31; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 32; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] 33; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 34; SSE-NEXT: packuswb %xmm0, %xmm0 35; SSE-NEXT: pand %xmm3, %xmm0 36; SSE-NEXT: pandn %xmm1, %xmm3 37; SSE-NEXT: por %xmm0, %xmm3 38; SSE-NEXT: movq %xmm3, (%r8) 39; SSE-NEXT: retq 40; 41; AVX-LABEL: store_i8_stride4_vf2: 42; AVX: # %bb.0: 43; AVX-NEXT: vmovdqa (%rdi), %xmm0 44; AVX-NEXT: vmovdqa (%rdx), %xmm1 45; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 46; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 47; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 48; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 49; AVX-NEXT: vmovq %xmm0, (%r8) 50; AVX-NEXT: retq 51; 52; AVX2-LABEL: 
store_i8_stride4_vf2: 53; AVX2: # %bb.0: 54; AVX2-NEXT: vmovdqa (%rdi), %xmm0 55; AVX2-NEXT: vmovdqa (%rdx), %xmm1 56; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 57; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 58; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 59; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 60; AVX2-NEXT: vmovq %xmm0, (%r8) 61; AVX2-NEXT: retq 62; 63; AVX2-FP-LABEL: store_i8_stride4_vf2: 64; AVX2-FP: # %bb.0: 65; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 66; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 67; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 68; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 69; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 70; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 71; AVX2-FP-NEXT: vmovq %xmm0, (%r8) 72; AVX2-FP-NEXT: retq 73; 74; AVX2-FCP-LABEL: store_i8_stride4_vf2: 75; AVX2-FCP: # %bb.0: 76; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 77; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 78; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 79; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 80; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 81; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 82; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) 83; AVX2-FCP-NEXT: retq 84; 85; AVX512-LABEL: 
store_i8_stride4_vf2: 86; AVX512: # %bb.0: 87; AVX512-NEXT: vmovdqa (%rdi), %xmm0 88; AVX512-NEXT: vmovdqa (%rdx), %xmm1 89; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 90; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 91; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 92; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 93; AVX512-NEXT: vmovq %xmm0, (%r8) 94; AVX512-NEXT: retq 95; 96; AVX512-FCP-LABEL: store_i8_stride4_vf2: 97; AVX512-FCP: # %bb.0: 98; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 99; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 100; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 101; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 102; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 103; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 104; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) 105; AVX512-FCP-NEXT: retq 106; 107; AVX512DQ-LABEL: store_i8_stride4_vf2: 108; AVX512DQ: # %bb.0: 109; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 110; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 111; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 112; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 113; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 114; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 115; AVX512DQ-NEXT: vmovq 
%xmm0, (%r8) 116; AVX512DQ-NEXT: retq 117; 118; AVX512DQ-FCP-LABEL: store_i8_stride4_vf2: 119; AVX512DQ-FCP: # %bb.0: 120; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 121; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 122; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 123; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 124; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 125; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 126; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) 127; AVX512DQ-FCP-NEXT: retq 128; 129; AVX512BW-LABEL: store_i8_stride4_vf2: 130; AVX512BW: # %bb.0: 131; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 132; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 133; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 134; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 135; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 136; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 137; AVX512BW-NEXT: vmovq %xmm0, (%r8) 138; AVX512BW-NEXT: retq 139; 140; AVX512BW-FCP-LABEL: store_i8_stride4_vf2: 141; AVX512BW-FCP: # %bb.0: 142; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 143; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 144; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 145; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 146; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 147; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 148; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) 149; AVX512BW-FCP-NEXT: retq 150; 151; AVX512DQ-BW-LABEL: store_i8_stride4_vf2: 152; AVX512DQ-BW: # %bb.0: 153; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 154; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 155; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 156; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 157; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 158; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 159; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) 160; AVX512DQ-BW-NEXT: retq 161; 162; AVX512DQ-BW-FCP-LABEL: store_i8_stride4_vf2: 163; AVX512DQ-BW-FCP: # %bb.0: 164; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 165; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 166; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 167; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 168; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 169; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,1,5,2,6,3,7,u,u,u,u,u,u,u,u] 170; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) 171; AVX512DQ-BW-FCP-NEXT: retq 172 %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64 173 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64 174 %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64 175 %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64 176 %1 
= shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 177 %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 178 %3 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 179 %interleaved.vec = shufflevector <8 x i8> %3, <8 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> 180 store <8 x i8> %interleaved.vec, ptr %out.vec, align 64 181 ret void 182} 183 184define void @store_i8_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { 185; SSE-LABEL: store_i8_stride4_vf4: 186; SSE: # %bb.0: 187; SSE-NEXT: movdqa (%rdi), %xmm0 188; SSE-NEXT: movdqa (%rdx), %xmm1 189; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 190; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 191; SSE-NEXT: pxor %xmm2, %xmm2 192; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 193; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,1,3] 194; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] 195; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] 196; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,0] 197; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] 198; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] 199; SSE-NEXT: packuswb %xmm3, %xmm1 200; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] 201; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 202; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,1,3] 203; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] 204; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] 205; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] 206; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 207; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] 208; SSE-NEXT: packuswb %xmm2, %xmm0 209; SSE-NEXT: pand %xmm3, %xmm0 210; SSE-NEXT: pandn %xmm1, %xmm3 211; SSE-NEXT: por %xmm0, %xmm3 212; SSE-NEXT: movdqa %xmm3, (%r8) 213; SSE-NEXT: retq 214; 215; AVX-LABEL: store_i8_stride4_vf4: 216; AVX: # %bb.0: 217; AVX-NEXT: vmovdqa (%rdi), %xmm0 218; AVX-NEXT: vmovdqa (%rdx), %xmm1 219; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 220; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 221; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 222; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 223; AVX-NEXT: vmovdqa %xmm0, (%r8) 224; AVX-NEXT: retq 225; 226; AVX2-LABEL: store_i8_stride4_vf4: 227; AVX2: # %bb.0: 228; AVX2-NEXT: vmovdqa (%rdi), %xmm0 229; AVX2-NEXT: vmovdqa (%rdx), %xmm1 230; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 231; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 232; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 233; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 234; AVX2-NEXT: vmovdqa %xmm0, (%r8) 235; AVX2-NEXT: retq 236; 237; AVX2-FP-LABEL: store_i8_stride4_vf4: 238; AVX2-FP: # %bb.0: 239; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 240; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 241; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 242; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 243; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 244; AVX2-FP-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 245; AVX2-FP-NEXT: vmovdqa %xmm0, (%r8) 246; AVX2-FP-NEXT: retq 247; 248; AVX2-FCP-LABEL: store_i8_stride4_vf4: 249; AVX2-FCP: # %bb.0: 250; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 251; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 252; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 253; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 254; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 255; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 256; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8) 257; AVX2-FCP-NEXT: retq 258; 259; AVX512-LABEL: store_i8_stride4_vf4: 260; AVX512: # %bb.0: 261; AVX512-NEXT: vmovdqa (%rdi), %xmm0 262; AVX512-NEXT: vmovdqa (%rdx), %xmm1 263; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 264; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 265; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 266; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 267; AVX512-NEXT: vmovdqa %xmm0, (%r8) 268; AVX512-NEXT: retq 269; 270; AVX512-FCP-LABEL: store_i8_stride4_vf4: 271; AVX512-FCP: # %bb.0: 272; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 273; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 274; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 275; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 276; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 277; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 278; AVX512-FCP-NEXT: vmovdqa %xmm0, 
(%r8) 279; AVX512-FCP-NEXT: retq 280; 281; AVX512DQ-LABEL: store_i8_stride4_vf4: 282; AVX512DQ: # %bb.0: 283; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 284; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 285; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 286; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 287; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 288; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 289; AVX512DQ-NEXT: vmovdqa %xmm0, (%r8) 290; AVX512DQ-NEXT: retq 291; 292; AVX512DQ-FCP-LABEL: store_i8_stride4_vf4: 293; AVX512DQ-FCP: # %bb.0: 294; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 295; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 296; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 297; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 298; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 299; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 300; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%r8) 301; AVX512DQ-FCP-NEXT: retq 302; 303; AVX512BW-LABEL: store_i8_stride4_vf4: 304; AVX512BW: # %bb.0: 305; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 306; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 307; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 308; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 309; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 310; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 311; AVX512BW-NEXT: vmovdqa %xmm0, (%r8) 312; AVX512BW-NEXT: retq 313; 314; 
AVX512BW-FCP-LABEL: store_i8_stride4_vf4: 315; AVX512BW-FCP: # %bb.0: 316; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 317; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 318; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 319; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 320; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 321; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 322; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r8) 323; AVX512BW-FCP-NEXT: retq 324; 325; AVX512DQ-BW-LABEL: store_i8_stride4_vf4: 326; AVX512DQ-BW: # %bb.0: 327; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 328; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 329; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 330; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 331; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 332; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 333; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%r8) 334; AVX512DQ-BW-NEXT: retq 335; 336; AVX512DQ-BW-FCP-LABEL: store_i8_stride4_vf4: 337; AVX512DQ-BW-FCP: # %bb.0: 338; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 339; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 340; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 341; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 342; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 343; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 344; AVX512DQ-BW-FCP-NEXT: 
vmovdqa %xmm0, (%r8) 345; AVX512DQ-BW-FCP-NEXT: retq 346 %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 347 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 348 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 349 %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64 350 %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 351 %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 352 %3 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 353 %interleaved.vec = shufflevector <16 x i8> %3, <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 354 store <16 x i8> %interleaved.vec, ptr %out.vec, align 64 355 ret void 356} 357 358define void @store_i8_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { 359; SSE-LABEL: store_i8_stride4_vf8: 360; SSE: # %bb.0: 361; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 362; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 363; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 364; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 365; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 366; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 367; SSE-NEXT: movdqa %xmm0, %xmm2 368; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 369; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 370; SSE-NEXT: movdqa %xmm0, 16(%r8) 371; 
SSE-NEXT: movdqa %xmm2, (%r8) 372; SSE-NEXT: retq 373; 374; AVX-LABEL: store_i8_stride4_vf8: 375; AVX: # %bb.0: 376; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 377; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 378; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 379; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 380; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 381; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 382; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 383; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 384; AVX-NEXT: vmovdqa %xmm0, 16(%r8) 385; AVX-NEXT: vmovdqa %xmm2, (%r8) 386; AVX-NEXT: retq 387; 388; AVX2-LABEL: store_i8_stride4_vf8: 389; AVX2: # %bb.0: 390; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 391; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 392; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 393; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 394; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 395; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 396; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 397; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 398; AVX2-NEXT: vmovdqa %xmm0, 16(%r8) 399; AVX2-NEXT: vmovdqa %xmm2, (%r8) 400; AVX2-NEXT: retq 401; 402; AVX2-FP-LABEL: store_i8_stride4_vf8: 403; AVX2-FP: # %bb.0: 404; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 405; 
AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 406; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 407; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 408; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 409; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 410; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 411; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 412; AVX2-FP-NEXT: vmovdqa %xmm0, 16(%r8) 413; AVX2-FP-NEXT: vmovdqa %xmm2, (%r8) 414; AVX2-FP-NEXT: retq 415; 416; AVX2-FCP-LABEL: store_i8_stride4_vf8: 417; AVX2-FCP: # %bb.0: 418; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 419; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 420; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 421; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 422; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 423; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 424; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 425; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 426; AVX2-FCP-NEXT: vmovdqa %xmm0, 16(%r8) 427; AVX2-FCP-NEXT: vmovdqa %xmm2, (%r8) 428; AVX2-FCP-NEXT: retq 429; 430; AVX512-LABEL: store_i8_stride4_vf8: 431; AVX512: # %bb.0: 432; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 433; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 434; AVX512-NEXT: vpunpcklbw 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 435; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 436; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 437; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 438; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 439; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 440; AVX512-NEXT: vmovdqa %xmm0, 16(%r8) 441; AVX512-NEXT: vmovdqa %xmm2, (%r8) 442; AVX512-NEXT: retq 443; 444; AVX512-FCP-LABEL: store_i8_stride4_vf8: 445; AVX512-FCP: # %bb.0: 446; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 447; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 448; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 449; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 450; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 451; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 452; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 453; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 454; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%r8) 455; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r8) 456; AVX512-FCP-NEXT: retq 457; 458; AVX512DQ-LABEL: store_i8_stride4_vf8: 459; AVX512DQ: # %bb.0: 460; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 461; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 462; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 463; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 464; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 465; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 466; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 467; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 468; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%r8) 469; AVX512DQ-NEXT: vmovdqa %xmm2, (%r8) 470; AVX512DQ-NEXT: retq 471; 472; AVX512DQ-FCP-LABEL: store_i8_stride4_vf8: 473; AVX512DQ-FCP: # %bb.0: 474; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 475; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 476; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 477; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 478; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 479; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 480; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 481; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 482; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%r8) 483; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r8) 484; AVX512DQ-FCP-NEXT: retq 485; 486; AVX512BW-LABEL: store_i8_stride4_vf8: 487; AVX512BW: # %bb.0: 488; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 489; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 490; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 491; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 492; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 493; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 494; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 495; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 496; AVX512BW-NEXT: vmovdqa %xmm0, 16(%r8) 497; AVX512BW-NEXT: vmovdqa %xmm2, (%r8) 498; AVX512BW-NEXT: retq 499; 500; AVX512BW-FCP-LABEL: store_i8_stride4_vf8: 501; AVX512BW-FCP: # %bb.0: 502; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 503; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 504; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 505; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 506; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 507; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 508; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 509; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 510; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 16(%r8) 511; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8) 512; AVX512BW-FCP-NEXT: retq 513; 514; AVX512DQ-BW-LABEL: store_i8_stride4_vf8: 515; AVX512DQ-BW: # %bb.0: 516; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 517; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 518; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 519; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 520; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 521; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 522; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 523; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 524; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 16(%r8) 525; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r8) 526; AVX512DQ-BW-NEXT: retq 527; 528; AVX512DQ-BW-FCP-LABEL: store_i8_stride4_vf8: 529; AVX512DQ-BW-FCP: # %bb.0: 530; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 531; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 532; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 533; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 534; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 535; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 536; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 537; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 538; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 16(%r8) 539; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8) 540; AVX512DQ-BW-FCP-NEXT: retq 541 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 542 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64 543 %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 
64 544 %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64 545 %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 546 %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 547 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 548 %interleaved.vec = shufflevector <32 x i8> %3, <32 x i8> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> 549 store <32 x i8> %interleaved.vec, ptr %out.vec, align 64 550 ret void 551} 552
; Interleaved-store test case, stride 4, with <16 x i8> inputs (vf16).
; The IR body loads one <16 x i8> from each of %in.vecptr0..%in.vecptr3,
; concatenates the four vectors into a single <64 x i8> (%1, %2, %3), applies
; the interleave mask (0, 16, 32, 48, 1, 17, 33, 49, ...) so that element i of
; every input lands at output positions 4*i .. 4*i+3, and stores the <64 x i8>
; result to %out.vec. All four loads and the store carry align 64.
; The per-prefix assertions between the define line and the IR body are
; autogenerated by utils/update_llc_test_checks.py (see the NOTE at the top of
; the file); regenerate them with that script instead of editing them by hand.
553define void @store_i8_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { 554; SSE-LABEL: store_i8_stride4_vf16: 555; SSE: # %bb.0: 556; SSE-NEXT: movdqa (%rdi), %xmm0 557; SSE-NEXT: movdqa (%rsi), %xmm1 558; SSE-NEXT: movdqa (%rdx), %xmm2 559; SSE-NEXT: movdqa (%rcx), %xmm3 560; SSE-NEXT: movdqa %xmm2, %xmm4 561; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 562; SSE-NEXT: movdqa %xmm0, %xmm5 563; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 564; SSE-NEXT: movdqa %xmm5, %xmm6 565; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 566; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 567; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 568; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 569; SSE-NEXT: movdqa %xmm0, %xmm1 570; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 571; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 572; SSE-NEXT: movdqa %xmm0, 32(%r8) 573; SSE-NEXT: movdqa %xmm1, 48(%r8) 574; SSE-NEXT: movdqa %xmm5, 16(%r8) 575; SSE-NEXT: movdqa %xmm6, (%r8) 576; SSE-NEXT: retq 577; 578; AVX-LABEL: store_i8_stride4_vf16: 579; AVX: # %bb.0: 580; AVX-NEXT: vmovdqa (%rdi), %xmm0 581; AVX-NEXT: vmovdqa (%rsi), %xmm1 582; AVX-NEXT: vmovdqa (%rdx), %xmm2 583; AVX-NEXT: vmovdqa (%rcx), %xmm3 584; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 585; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 586; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 587; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 588; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 589; AVX-NEXT: vpunpckhwd {{.*#+}} 
xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 590; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 591; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 592; AVX-NEXT: vmovdqa %xmm4, 32(%r8) 593; AVX-NEXT: vmovdqa %xmm0, 48(%r8) 594; AVX-NEXT: vmovdqa %xmm1, 16(%r8) 595; AVX-NEXT: vmovdqa %xmm3, (%r8) 596; AVX-NEXT: retq 597; 598; AVX2-LABEL: store_i8_stride4_vf16: 599; AVX2: # %bb.0: 600; AVX2-NEXT: vmovdqa (%rdi), %xmm0 601; AVX2-NEXT: vmovdqa (%rsi), %xmm1 602; AVX2-NEXT: vmovdqa (%rdx), %xmm2 603; AVX2-NEXT: vmovdqa (%rcx), %xmm3 604; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 605; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 606; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 607; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 608; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 609; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 610; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 611; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 612; AVX2-NEXT: vmovdqa %xmm4, 32(%r8) 613; AVX2-NEXT: vmovdqa %xmm0, 48(%r8) 614; AVX2-NEXT: vmovdqa %xmm1, 16(%r8) 615; AVX2-NEXT: vmovdqa %xmm3, (%r8) 616; AVX2-NEXT: retq 617; 618; 
AVX2-FP-LABEL: store_i8_stride4_vf16: 619; AVX2-FP: # %bb.0: 620; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 621; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 622; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 623; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3 624; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 625; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 626; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 627; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 628; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 629; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 630; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 631; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 632; AVX2-FP-NEXT: vmovdqa %xmm4, 32(%r8) 633; AVX2-FP-NEXT: vmovdqa %xmm0, 48(%r8) 634; AVX2-FP-NEXT: vmovdqa %xmm1, 16(%r8) 635; AVX2-FP-NEXT: vmovdqa %xmm3, (%r8) 636; AVX2-FP-NEXT: retq 637; 638; AVX2-FCP-LABEL: store_i8_stride4_vf16: 639; AVX2-FCP: # %bb.0: 640; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 641; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 642; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 643; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3 644; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 645; 
AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 646; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 647; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 648; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 649; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 650; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 651; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 652; AVX2-FCP-NEXT: vmovdqa %xmm4, 32(%r8) 653; AVX2-FCP-NEXT: vmovdqa %xmm0, 48(%r8) 654; AVX2-FCP-NEXT: vmovdqa %xmm1, 16(%r8) 655; AVX2-FCP-NEXT: vmovdqa %xmm3, (%r8) 656; AVX2-FCP-NEXT: retq 657; 658; AVX512-LABEL: store_i8_stride4_vf16: 659; AVX512: # %bb.0: 660; AVX512-NEXT: vmovdqa (%rdi), %xmm0 661; AVX512-NEXT: vmovdqa (%rsi), %xmm1 662; AVX512-NEXT: vmovdqa (%rdx), %xmm2 663; AVX512-NEXT: vmovdqa (%rcx), %xmm3 664; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 665; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 666; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 667; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 668; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 669; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 670; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 671; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 672; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 673; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 674; AVX512-NEXT: vmovdqa %ymm0, 32(%r8) 675; AVX512-NEXT: vmovdqa %ymm1, (%r8) 676; AVX512-NEXT: vzeroupper 677; AVX512-NEXT: retq 678; 679; AVX512-FCP-LABEL: store_i8_stride4_vf16: 680; AVX512-FCP: # %bb.0: 681; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 682; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 683; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 684; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 685; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 686; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 687; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 688; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 689; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 690; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 691; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 692; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 693; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 694; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 695; AVX512-FCP-NEXT: vmovdqa %ymm0, 32(%r8) 696; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r8) 697; AVX512-FCP-NEXT: vzeroupper 698; AVX512-FCP-NEXT: retq 699; 700; AVX512DQ-LABEL: store_i8_stride4_vf16: 701; AVX512DQ: # %bb.0: 702; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 703; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 704; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 705; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 706; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 707; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 708; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 709; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 710; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 711; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 712; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 713; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 714; AVX512DQ-NEXT: vinserti128 $1, 
%xmm1, %ymm3, %ymm1 715; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 716; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%r8) 717; AVX512DQ-NEXT: vmovdqa %ymm1, (%r8) 718; AVX512DQ-NEXT: vzeroupper 719; AVX512DQ-NEXT: retq 720; 721; AVX512DQ-FCP-LABEL: store_i8_stride4_vf16: 722; AVX512DQ-FCP: # %bb.0: 723; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 724; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 725; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 726; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 727; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 728; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 729; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 730; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 731; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 732; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 733; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 734; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 735; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 736; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 737; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 32(%r8) 738; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r8) 739; AVX512DQ-FCP-NEXT: vzeroupper 740; AVX512DQ-FCP-NEXT: retq 741; 742; AVX512BW-LABEL: 
store_i8_stride4_vf16: 743; AVX512BW: # %bb.0: 744; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 745; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 746; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 747; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 748; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 749; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 750; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 751; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 752; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 753; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 754; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 755; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 756; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 757; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 758; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 759; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) 760; AVX512BW-NEXT: vzeroupper 761; AVX512BW-NEXT: retq 762; 763; AVX512BW-FCP-LABEL: store_i8_stride4_vf16: 764; AVX512BW-FCP: # %bb.0: 765; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 766; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 767; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 768; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 769; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 770; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 771; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 772; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 773; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 774; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 775; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 776; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 777; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 778; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 779; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 780; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) 781; AVX512BW-FCP-NEXT: vzeroupper 782; AVX512BW-FCP-NEXT: retq 783; 784; AVX512DQ-BW-LABEL: store_i8_stride4_vf16: 785; AVX512DQ-BW: # %bb.0: 786; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 787; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 788; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 789; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 790; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 791; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 792; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 793; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 794; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 795; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 796; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 797; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 798; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 799; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 800; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 801; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) 802; AVX512DQ-BW-NEXT: vzeroupper 803; AVX512DQ-BW-NEXT: retq 804; 805; AVX512DQ-BW-FCP-LABEL: store_i8_stride4_vf16: 806; AVX512DQ-BW-FCP: # %bb.0: 807; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 808; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 809; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 810; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 811; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 812; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 813; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 814; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 815; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 816; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 817; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 818; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 819; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 820; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 821; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 822; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) 823; AVX512DQ-BW-FCP-NEXT: vzeroupper 824; AVX512DQ-BW-FCP-NEXT: retq 825 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 826 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64 827 %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64 828 %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64 829 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 830 %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 831 %3 = shufflevector 
<32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 832 %interleaved.vec = shufflevector <64 x i8> %3, <64 x i8> poison, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63> 833 store <64 x i8> %interleaved.vec, ptr %out.vec, align 64 834 ret void 835} 836 837define void @store_i8_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { 838; SSE-LABEL: store_i8_stride4_vf32: 839; SSE: # %bb.0: 840; SSE-NEXT: movdqa (%rdi), %xmm0 841; SSE-NEXT: movdqa 16(%rdi), %xmm1 842; SSE-NEXT: movdqa (%rsi), %xmm5 843; SSE-NEXT: movdqa 16(%rsi), %xmm6 844; SSE-NEXT: movdqa (%rdx), %xmm7 845; SSE-NEXT: movdqa 16(%rdx), %xmm4 846; SSE-NEXT: movdqa (%rcx), %xmm8 847; SSE-NEXT: movdqa 16(%rcx), %xmm9 848; SSE-NEXT: movdqa %xmm7, %xmm10 849; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] 850; SSE-NEXT: movdqa %xmm0, %xmm2 851; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 852; SSE-NEXT: movdqa %xmm2, %xmm3 853; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 854; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] 855; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] 856; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 857; SSE-NEXT: movdqa %xmm0, %xmm5 858; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 859; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] 860; SSE-NEXT: movdqa %xmm4, %xmm7 861; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 862; SSE-NEXT: movdqa %xmm1, %xmm8 863; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 864; SSE-NEXT: movdqa %xmm8, %xmm10 865; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] 866; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 867; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 868; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 869; SSE-NEXT: movdqa %xmm1, %xmm6 870; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 871; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 872; SSE-NEXT: movdqa %xmm1, 96(%r8) 873; SSE-NEXT: movdqa %xmm6, 112(%r8) 874; SSE-NEXT: movdqa %xmm8, 64(%r8) 875; SSE-NEXT: movdqa %xmm10, 80(%r8) 876; SSE-NEXT: movdqa %xmm0, 32(%r8) 877; SSE-NEXT: movdqa %xmm5, 48(%r8) 878; SSE-NEXT: movdqa %xmm2, (%r8) 879; SSE-NEXT: movdqa %xmm3, 16(%r8) 880; SSE-NEXT: retq 881; 882; AVX-LABEL: store_i8_stride4_vf32: 883; AVX: # %bb.0: 884; AVX-NEXT: vmovdqa (%rsi), %xmm0 885; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 886; AVX-NEXT: vmovdqa (%rdi), %xmm2 887; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 888; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 889; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 890; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 891; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 892; AVX-NEXT: vmovdqa (%rcx), %xmm2 893; AVX-NEXT: vmovdqa 16(%rcx), %xmm3 894; AVX-NEXT: vmovdqa (%rdx), %xmm6 895; AVX-NEXT: vmovdqa 16(%rdx), %xmm7 896; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 897; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 898; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] 899; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] 900; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] 901; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] 902; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 903; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 904; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 905; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 906; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 907; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 908; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm2 909; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 910; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm3 911; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 912; AVX-NEXT: vmovaps %ymm2, (%r8) 913; AVX-NEXT: vmovaps %ymm1, 96(%r8) 914; AVX-NEXT: vmovaps %ymm3, 64(%r8) 915; AVX-NEXT: vmovaps %ymm0, 32(%r8) 916; AVX-NEXT: vzeroupper 917; AVX-NEXT: retq 918; 919; AVX2-LABEL: store_i8_stride4_vf32: 920; AVX2: # %bb.0: 921; AVX2-NEXT: vmovdqa (%rdi), %ymm0 922; AVX2-NEXT: vmovdqa (%rsi), %ymm1 923; AVX2-NEXT: vmovdqa (%rdx), %ymm2 924; AVX2-NEXT: vmovdqa (%rcx), %ymm3 925; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 926; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 927; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 928; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 929; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 930; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 931; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 932; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 933; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, 
%ymm2 934; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 935; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 936; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 937; AVX2-NEXT: vmovdqa %ymm0, 96(%r8) 938; AVX2-NEXT: vmovdqa %ymm1, 64(%r8) 939; AVX2-NEXT: vmovdqa %ymm5, 32(%r8) 940; AVX2-NEXT: vmovdqa %ymm2, (%r8) 941; AVX2-NEXT: vzeroupper 942; AVX2-NEXT: retq 943; 944; AVX2-FP-LABEL: store_i8_stride4_vf32: 945; AVX2-FP: # %bb.0: 946; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 947; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 948; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 949; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm3 950; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 951; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 952; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 953; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 954; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 955; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 956; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 957; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 958; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 959; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 960; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 961; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 962; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%r8) 963; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%r8) 964; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r8) 965; AVX2-FP-NEXT: vmovdqa %ymm2, (%r8) 966; AVX2-FP-NEXT: vzeroupper 967; AVX2-FP-NEXT: retq 968; 969; AVX2-FCP-LABEL: store_i8_stride4_vf32: 970; AVX2-FCP: # %bb.0: 971; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 972; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 973; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm2 974; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm3 975; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 976; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 977; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 978; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 979; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 980; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 981; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 982; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 983; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 984; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 985; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 986; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 987; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%r8) 988; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%r8) 989; 
AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r8) 990; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r8) 991; AVX2-FCP-NEXT: vzeroupper 992; AVX2-FCP-NEXT: retq 993; 994; AVX512-LABEL: store_i8_stride4_vf32: 995; AVX512: # %bb.0: 996; AVX512-NEXT: vmovdqa (%rdi), %ymm0 997; AVX512-NEXT: vmovdqa (%rsi), %ymm1 998; AVX512-NEXT: vmovdqa (%rdx), %ymm2 999; AVX512-NEXT: vmovdqa (%rcx), %ymm3 1000; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1001; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1002; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1003; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1004; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1005; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1006; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1007; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1008; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1009; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1010; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 1011; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 1012; AVX512-NEXT: vmovdqa %ymm0, 96(%r8) 1013; AVX512-NEXT: vmovdqa %ymm1, 64(%r8) 1014; AVX512-NEXT: vmovdqa %ymm5, 32(%r8) 1015; AVX512-NEXT: vmovdqa %ymm2, (%r8) 1016; AVX512-NEXT: vzeroupper 1017; AVX512-NEXT: retq 1018; 1019; AVX512-FCP-LABEL: store_i8_stride4_vf32: 1020; AVX512-FCP: # %bb.0: 1021; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1022; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 1023; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 1024; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3 1025; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1026; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1027; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1028; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1029; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1030; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1031; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1032; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1033; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1034; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1035; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 1036; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 1037; AVX512-FCP-NEXT: vmovdqa %ymm0, 96(%r8) 1038; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%r8) 1039; AVX512-FCP-NEXT: vmovdqa %ymm5, 32(%r8) 1040; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8) 1041; AVX512-FCP-NEXT: vzeroupper 1042; AVX512-FCP-NEXT: retq 1043; 1044; AVX512DQ-LABEL: store_i8_stride4_vf32: 1045; AVX512DQ: # %bb.0: 1046; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1047; AVX512DQ-NEXT: vmovdqa (%rsi), 
%ymm1 1048; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 1049; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm3 1050; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1051; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1052; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1053; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1054; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1055; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1056; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1057; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1058; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1059; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1060; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 1061; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 1062; AVX512DQ-NEXT: vmovdqa %ymm0, 96(%r8) 1063; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%r8) 1064; AVX512DQ-NEXT: vmovdqa %ymm5, 32(%r8) 1065; AVX512DQ-NEXT: vmovdqa %ymm2, (%r8) 1066; AVX512DQ-NEXT: vzeroupper 1067; AVX512DQ-NEXT: retq 1068; 1069; AVX512DQ-FCP-LABEL: store_i8_stride4_vf32: 1070; AVX512DQ-FCP: # %bb.0: 1071; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1072; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 1073; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 1074; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3 1075; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1076; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1077; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1078; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1079; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1080; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1081; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1082; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1083; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1084; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1085; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 1086; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 1087; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 96(%r8) 1088; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%r8) 1089; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, 32(%r8) 1090; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) 1091; AVX512DQ-FCP-NEXT: vzeroupper 1092; AVX512DQ-FCP-NEXT: retq 1093; 1094; AVX512BW-LABEL: store_i8_stride4_vf32: 1095; AVX512BW: # %bb.0: 1096; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1097; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1 1098; AVX512BW-NEXT: vmovdqa (%rdx), %ymm2 1099; AVX512BW-NEXT: vmovdqa (%rcx), %ymm3 1100; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1101; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1102; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1103; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1104; AVX512BW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1105; AVX512BW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1106; AVX512BW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1107; AVX512BW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1108; 
AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1109; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1110; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1111; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 1112; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1113; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1114; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) 1115; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) 1116; AVX512BW-NEXT: vzeroupper 1117; AVX512BW-NEXT: retq 1118; 1119; AVX512BW-FCP-LABEL: store_i8_stride4_vf32: 1120; AVX512BW-FCP: # %bb.0: 1121; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1122; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm1 1123; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm2 1124; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm3 1125; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1126; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1127; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1128; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1129; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1130; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1131; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1132; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1133; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1134; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1135; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1136; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 1137; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1138; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1139; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) 1140; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) 1141; AVX512BW-FCP-NEXT: vzeroupper 1142; AVX512BW-FCP-NEXT: retq 1143; 1144; AVX512DQ-BW-LABEL: store_i8_stride4_vf32: 1145; AVX512DQ-BW: # %bb.0: 1146; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 1147; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm1 1148; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm2 1149; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm3 1150; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1151; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1152; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1153; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1154; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1155; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1156; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1157; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1158; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1159; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1160; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1161; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 1162; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1163; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1164; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8) 1165; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) 1166; AVX512DQ-BW-NEXT: vzeroupper 1167; AVX512DQ-BW-NEXT: retq 1168; 1169; AVX512DQ-BW-FCP-LABEL: store_i8_stride4_vf32: 1170; AVX512DQ-BW-FCP: # %bb.0: 1171; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1172; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm1 1173; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm2 1174; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm3 1175; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 1176; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 1177; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 1178; AVX512DQ-BW-FCP-NEXT: vpunpckhbw 
{{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 1179; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 1180; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 1181; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1182; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1183; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1184; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1185; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1186; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 1187; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 1188; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1189; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) 1190; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) 1191; AVX512DQ-BW-FCP-NEXT: vzeroupper 1192; AVX512DQ-BW-FCP-NEXT: retq 1193 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 1194 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64 1195 %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64 1196 %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64 1197 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1198 %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1199 %3 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, 
i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 1200 %interleaved.vec = shufflevector <128 x i8> %3, <128 x i8> poison, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127> 1201 store <128 x i8> %interleaved.vec, ptr %out.vec, align 64 1202 ret void 1203} 1204 1205define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { 1206; SSE-LABEL: store_i8_stride4_vf64: 1207; SSE: # %bb.0: 1208; SSE-NEXT: movdqa (%rdi), %xmm5 1209; SSE-NEXT: movdqa 16(%rdi), %xmm11 1210; SSE-NEXT: movdqa 32(%rdi), %xmm4 1211; SSE-NEXT: movdqa 48(%rdi), %xmm2 1212; SSE-NEXT: movdqa (%rsi), %xmm0 1213; SSE-NEXT: movdqa 16(%rsi), %xmm3 1214; SSE-NEXT: movdqa 32(%rsi), %xmm9 1215; SSE-NEXT: movdqa (%rdx), %xmm7 1216; SSE-NEXT: movdqa 16(%rdx), %xmm13 1217; SSE-NEXT: movdqa 32(%rdx), %xmm10 1218; SSE-NEXT: movdqa (%rcx), %xmm8 1219; SSE-NEXT: movdqa 16(%rcx), 
%xmm14 1220; SSE-NEXT: movdqa 32(%rcx), %xmm12 1221; SSE-NEXT: movdqa %xmm7, %xmm15 1222; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] 1223; SSE-NEXT: movdqa %xmm5, %xmm6 1224; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 1225; SSE-NEXT: movdqa %xmm6, %xmm1 1226; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] 1227; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1228; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] 1229; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] 1230; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 1231; SSE-NEXT: movdqa %xmm5, %xmm0 1232; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] 1233; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1234; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] 1235; SSE-NEXT: movdqa %xmm13, %xmm15 1236; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] 1237; SSE-NEXT: movdqa %xmm11, %xmm7 1238; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 1239; SSE-NEXT: movdqa %xmm7, %xmm0 1240; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] 1241; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1242; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] 1243; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] 1244; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] 1245; SSE-NEXT: movdqa %xmm11, %xmm8 1246; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] 1247; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] 1248; SSE-NEXT: movdqa %xmm10, %xmm15 1249; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] 1250; SSE-NEXT: movdqa %xmm4, %xmm13 1251; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] 1252; SSE-NEXT: movdqa %xmm13, %xmm14 1253; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] 1254; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] 1255; SSE-NEXT: movdqa 48(%rdx), %xmm15 1256; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] 1257; SSE-NEXT: movdqa 48(%rcx), %xmm12 1258; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1259; SSE-NEXT: movdqa %xmm4, %xmm9 1260; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 1261; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 1262; SSE-NEXT: movdqa %xmm15, %xmm10 1263; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] 1264; SSE-NEXT: movdqa 48(%rsi), %xmm1 1265; SSE-NEXT: movdqa %xmm2, %xmm3 1266; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1267; SSE-NEXT: movdqa %xmm3, %xmm0 1268; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 1269; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] 1270; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] 1271; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1272; SSE-NEXT: movdqa %xmm2, %xmm1 1273; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] 1274; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] 1275; SSE-NEXT: movdqa %xmm2, 224(%r8) 1276; SSE-NEXT: movdqa %xmm1, 240(%r8) 1277; SSE-NEXT: movdqa %xmm3, 192(%r8) 1278; SSE-NEXT: movdqa %xmm0, 208(%r8) 1279; SSE-NEXT: movdqa %xmm4, 160(%r8) 1280; SSE-NEXT: movdqa %xmm9, 176(%r8) 1281; 
SSE-NEXT: movdqa %xmm13, 128(%r8) 1282; SSE-NEXT: movdqa %xmm14, 144(%r8) 1283; SSE-NEXT: movdqa %xmm11, 96(%r8) 1284; SSE-NEXT: movdqa %xmm8, 112(%r8) 1285; SSE-NEXT: movdqa %xmm7, 64(%r8) 1286; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1287; SSE-NEXT: movaps %xmm0, 80(%r8) 1288; SSE-NEXT: movdqa %xmm5, 32(%r8) 1289; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1290; SSE-NEXT: movaps %xmm0, 48(%r8) 1291; SSE-NEXT: movdqa %xmm6, (%r8) 1292; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1293; SSE-NEXT: movaps %xmm0, 16(%r8) 1294; SSE-NEXT: retq 1295; 1296; AVX-LABEL: store_i8_stride4_vf64: 1297; AVX: # %bb.0: 1298; AVX-NEXT: vmovdqa (%rsi), %xmm0 1299; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 1300; AVX-NEXT: vmovdqa 32(%rsi), %xmm2 1301; AVX-NEXT: vmovdqa 48(%rsi), %xmm4 1302; AVX-NEXT: vmovdqa (%rdi), %xmm6 1303; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 1304; AVX-NEXT: vmovdqa 32(%rdi), %xmm9 1305; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 1306; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 1307; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] 1308; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] 1309; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] 1310; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] 1311; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1312; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] 1313; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] 1314; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] 1315; AVX-NEXT: vmovdqa (%rcx), %xmm6 1316; AVX-NEXT: vmovdqa 16(%rcx), %xmm9 1317; AVX-NEXT: vmovdqa 32(%rcx), %xmm10 1318; AVX-NEXT: vmovdqa 48(%rcx), %xmm12 1319; AVX-NEXT: vmovdqa (%rdx), %xmm8 1320; AVX-NEXT: vmovdqa 16(%rdx), %xmm13 1321; AVX-NEXT: vmovdqa 32(%rdx), %xmm14 1322; AVX-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 1323; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] 1324; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] 1325; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 1326; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] 1327; AVX-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] 1328; AVX-NEXT: vmovdqa 48(%rdx), %xmm10 1329; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] 1330; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] 1331; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] 1332; AVX-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1333; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] 1334; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] 1335; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 1336; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 1337; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] 1338; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] 1339; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] 1340; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] 1341; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 1342; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] 1343; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1344; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] 1345; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] 1346; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1347; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] 1348; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 1349; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 1350; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm6 1351; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 1352; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 1353; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 1354; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 1355; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1356; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 1357; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 1358; AVX-NEXT: vmovaps %ymm5, 64(%r8) 1359; AVX-NEXT: vmovaps %ymm1, 96(%r8) 1360; AVX-NEXT: vmovaps %ymm7, 128(%r8) 1361; AVX-NEXT: vmovaps %ymm0, 192(%r8) 1362; AVX-NEXT: vmovaps %ymm2, 160(%r8) 1363; AVX-NEXT: vmovaps %ymm4, 224(%r8) 1364; AVX-NEXT: vmovaps %ymm3, (%r8) 1365; AVX-NEXT: vmovaps %ymm6, 32(%r8) 1366; AVX-NEXT: vzeroupper 1367; AVX-NEXT: retq 1368; 1369; AVX2-LABEL: store_i8_stride4_vf64: 1370; AVX2: # %bb.0: 1371; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1372; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1373; AVX2-NEXT: vmovdqa (%rsi), %ymm2 1374; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 1375; AVX2-NEXT: vmovdqa (%rdx), %ymm4 1376; AVX2-NEXT: vmovdqa 32(%rdx), %ymm5 1377; AVX2-NEXT: vmovdqa (%rcx), %ymm6 1378; AVX2-NEXT: vmovdqa 32(%rcx), %ymm7 1379; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1380; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 1381; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1382; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 1383; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] 1384; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] 1385; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31] 1386; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] 1387; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] 1388; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] 1389; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] 1390; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] 1391; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] 1392; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] 1393; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] 1394; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] 1395; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4 1396; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5 1397; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 1398; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1399; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7 
1400; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1401; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] 1402; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1403; AVX2-NEXT: vmovdqa %ymm3, 192(%r8) 1404; AVX2-NEXT: vmovdqa %ymm1, 224(%r8) 1405; AVX2-NEXT: vmovdqa %ymm2, 64(%r8) 1406; AVX2-NEXT: vmovdqa %ymm0, 96(%r8) 1407; AVX2-NEXT: vmovdqa %ymm7, 128(%r8) 1408; AVX2-NEXT: vmovdqa %ymm9, 160(%r8) 1409; AVX2-NEXT: vmovdqa %ymm4, (%r8) 1410; AVX2-NEXT: vmovdqa %ymm5, 32(%r8) 1411; AVX2-NEXT: vzeroupper 1412; AVX2-NEXT: retq 1413; 1414; AVX2-FP-LABEL: store_i8_stride4_vf64: 1415; AVX2-FP: # %bb.0: 1416; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1417; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 1418; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 1419; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3 1420; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm4 1421; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm5 1422; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm6 1423; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm7 1424; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1425; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 1426; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1427; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 1428; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] 1429; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] 1430; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31] 1431; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] 1432; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] 1433; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] 1434; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] 1435; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] 1436; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] 1437; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] 1438; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] 1439; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] 1440; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4 1441; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5 1442; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 1443; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1444; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7 1445; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1446; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] 1447; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1448; AVX2-FP-NEXT: vmovdqa %ymm3, 192(%r8) 1449; AVX2-FP-NEXT: vmovdqa %ymm1, 224(%r8) 1450; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%r8) 1451; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%r8) 1452; AVX2-FP-NEXT: vmovdqa %ymm7, 128(%r8) 1453; AVX2-FP-NEXT: vmovdqa %ymm9, 160(%r8) 1454; AVX2-FP-NEXT: vmovdqa %ymm4, (%r8) 1455; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r8) 1456; AVX2-FP-NEXT: vzeroupper 1457; AVX2-FP-NEXT: retq 1458; 1459; AVX2-FCP-LABEL: store_i8_stride4_vf64: 1460; 
AVX2-FCP: # %bb.0: 1461; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1462; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1463; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 1464; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 1465; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4 1466; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 1467; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6 1468; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 1469; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1470; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 1471; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1472; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 1473; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] 
1474; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] 1475; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31] 1476; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] 1477; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] 1478; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] 1479; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] 1480; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] 1481; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] 1482; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = 
ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] 1483; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] 1484; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] 1485; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4 1486; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5 1487; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 1488; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1489; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7 1490; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1491; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] 1492; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1493; AVX2-FCP-NEXT: vmovdqa %ymm3, 192(%r8) 1494; AVX2-FCP-NEXT: vmovdqa %ymm1, 224(%r8) 1495; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%r8) 1496; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%r8) 1497; AVX2-FCP-NEXT: vmovdqa %ymm7, 128(%r8) 1498; AVX2-FCP-NEXT: vmovdqa %ymm9, 160(%r8) 1499; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r8) 1500; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r8) 1501; AVX2-FCP-NEXT: vzeroupper 1502; AVX2-FCP-NEXT: retq 1503; 1504; AVX512-LABEL: store_i8_stride4_vf64: 1505; AVX512: # %bb.0: 1506; AVX512-NEXT: vmovdqa (%rsi), %ymm0 1507; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 1508; AVX512-NEXT: vmovdqa (%rdi), %ymm2 1509; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 1510; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] 1511; AVX512-NEXT: vpunpcklbw 
{{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] 1512; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] 1513; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] 1514; AVX512-NEXT: vmovdqa (%rcx), %ymm2 1515; AVX512-NEXT: vmovdqa 32(%rcx), %ymm3 1516; AVX512-NEXT: vmovdqa (%rdx), %ymm6 1517; AVX512-NEXT: vmovdqa 32(%rdx), %ymm7 1518; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[16],ymm2[16],ymm6[17],ymm2[17],ymm6[18],ymm2[18],ymm6[19],ymm2[19],ymm6[20],ymm2[20],ymm6[21],ymm2[21],ymm6[22],ymm2[22],ymm6[23],ymm2[23] 1519; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[16],ymm3[16],ymm7[17],ymm3[17],ymm7[18],ymm3[18],ymm7[19],ymm3[19],ymm7[20],ymm3[20],ymm7[21],ymm3[21],ymm7[22],ymm3[22],ymm7[23],ymm3[23] 1520; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] 1521; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15],ymm7[24],ymm3[24],ymm7[25],ymm3[25],ymm7[26],ymm3[26],ymm7[27],ymm3[27],ymm7[28],ymm3[28],ymm7[29],ymm3[29],ymm7[30],ymm3[30],ymm7[31],ymm3[31] 1522; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] 1523; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] 1524; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] 1525; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] 1526; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 1527; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1528; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 1529; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1530; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm2 1531; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm3 1532; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] 1533; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1534; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm7 1535; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1536; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] 1537; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1538; AVX512-NEXT: vmovdqa %ymm5, 192(%r8) 1539; AVX512-NEXT: vmovdqa %ymm1, 224(%r8) 1540; AVX512-NEXT: vmovdqa %ymm4, 64(%r8) 1541; AVX512-NEXT: vmovdqa %ymm0, 96(%r8) 1542; AVX512-NEXT: vmovdqa %ymm7, 128(%r8) 1543; AVX512-NEXT: vmovdqa %ymm9, 160(%r8) 1544; AVX512-NEXT: vmovdqa %ymm2, (%r8) 1545; AVX512-NEXT: vmovdqa %ymm3, 32(%r8) 1546; AVX512-NEXT: vzeroupper 1547; AVX512-NEXT: retq 1548; 1549; AVX512-FCP-LABEL: store_i8_stride4_vf64: 1550; AVX512-FCP: # %bb.0: 1551; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0 1552; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 1553; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 1554; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 1555; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] 1556; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] 1557; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = 
ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] 1558; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] 1559; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm2 1560; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm3 1561; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6 1562; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 1563; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[16],ymm2[16],ymm6[17],ymm2[17],ymm6[18],ymm2[18],ymm6[19],ymm2[19],ymm6[20],ymm2[20],ymm6[21],ymm2[21],ymm6[22],ymm2[22],ymm6[23],ymm2[23] 1564; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[16],ymm3[16],ymm7[17],ymm3[17],ymm7[18],ymm3[18],ymm7[19],ymm3[19],ymm7[20],ymm3[20],ymm7[21],ymm3[21],ymm7[22],ymm3[22],ymm7[23],ymm3[23] 1565; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] 1566; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15],ymm7[24],ymm3[24],ymm7[25],ymm3[25],ymm7[26],ymm3[26],ymm7[27],ymm3[27],ymm7[28],ymm3[28],ymm7[29],ymm3[29],ymm7[30],ymm3[30],ymm7[31],ymm3[31] 1567; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] 1568; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] 1569; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] 1570; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] 1571; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 1572; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1573; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 1574; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1575; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm2 1576; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm3 1577; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] 1578; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm9[2,3],ymm0[2,3] 1579; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm7 1580; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1581; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] 1582; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1583; AVX512-FCP-NEXT: vmovdqa %ymm5, 192(%r8) 1584; AVX512-FCP-NEXT: vmovdqa %ymm1, 224(%r8) 1585; AVX512-FCP-NEXT: vmovdqa %ymm4, 64(%r8) 1586; AVX512-FCP-NEXT: vmovdqa %ymm0, 96(%r8) 1587; AVX512-FCP-NEXT: vmovdqa %ymm7, 128(%r8) 1588; AVX512-FCP-NEXT: vmovdqa %ymm9, 160(%r8) 1589; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8) 1590; AVX512-FCP-NEXT: vmovdqa %ymm3, 32(%r8) 1591; AVX512-FCP-NEXT: vzeroupper 1592; AVX512-FCP-NEXT: retq 1593; 1594; AVX512DQ-LABEL: store_i8_stride4_vf64: 1595; AVX512DQ: # %bb.0: 1596; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm0 1597; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 1598; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 1599; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 1600; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] 1601; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] 1602; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] 1603; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] 1604; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 1605; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm3 1606; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm6 1607; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm7 1608; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[16],ymm2[16],ymm6[17],ymm2[17],ymm6[18],ymm2[18],ymm6[19],ymm2[19],ymm6[20],ymm2[20],ymm6[21],ymm2[21],ymm6[22],ymm2[22],ymm6[23],ymm2[23] 1609; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[16],ymm3[16],ymm7[17],ymm3[17],ymm7[18],ymm3[18],ymm7[19],ymm3[19],ymm7[20],ymm3[20],ymm7[21],ymm3[21],ymm7[22],ymm3[22],ymm7[23],ymm3[23] 1610; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] 1611; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15],ymm7[24],ymm3[24],ymm7[25],ymm3[25],ymm7[26],ymm3[26],ymm7[27],ymm3[27],ymm7[28],ymm3[28],ymm7[29],ymm3[29],ymm7[30],ymm3[30],ymm7[31],ymm3[31] 1612; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] 1613; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] 1614; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] 1615; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] 1616; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 1617; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1618; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 1619; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1620; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm2 1621; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm3 1622; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] 1623; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1624; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm7 1625; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1626; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] 1627; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1628; AVX512DQ-NEXT: vmovdqa %ymm5, 192(%r8) 1629; AVX512DQ-NEXT: vmovdqa %ymm1, 224(%r8) 1630; AVX512DQ-NEXT: vmovdqa %ymm4, 64(%r8) 1631; AVX512DQ-NEXT: vmovdqa %ymm0, 96(%r8) 1632; AVX512DQ-NEXT: vmovdqa %ymm7, 128(%r8) 1633; AVX512DQ-NEXT: vmovdqa %ymm9, 160(%r8) 
1634; AVX512DQ-NEXT: vmovdqa %ymm2, (%r8) 1635; AVX512DQ-NEXT: vmovdqa %ymm3, 32(%r8) 1636; AVX512DQ-NEXT: vzeroupper 1637; AVX512DQ-NEXT: retq 1638; 1639; AVX512DQ-FCP-LABEL: store_i8_stride4_vf64: 1640; AVX512DQ-FCP: # %bb.0: 1641; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0 1642; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 1643; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 1644; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 1645; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] 1646; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] 1647; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] 1648; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] 1649; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 1650; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm3 1651; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm6 1652; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 1653; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = 
ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[16],ymm2[16],ymm6[17],ymm2[17],ymm6[18],ymm2[18],ymm6[19],ymm2[19],ymm6[20],ymm2[20],ymm6[21],ymm2[21],ymm6[22],ymm2[22],ymm6[23],ymm2[23] 1654; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[16],ymm3[16],ymm7[17],ymm3[17],ymm7[18],ymm3[18],ymm7[19],ymm3[19],ymm7[20],ymm3[20],ymm7[21],ymm3[21],ymm7[22],ymm3[22],ymm7[23],ymm3[23] 1655; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] 1656; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15],ymm7[24],ymm3[24],ymm7[25],ymm3[25],ymm7[26],ymm3[26],ymm7[27],ymm3[27],ymm7[28],ymm3[28],ymm7[29],ymm3[29],ymm7[30],ymm3[30],ymm7[31],ymm3[31] 1657; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] 1658; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] 1659; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] 1660; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] 1661; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 1662; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 1663; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 1664; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 1665; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm2 1666; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm3 1667; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] 1668; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1669; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm7 1670; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1671; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] 1672; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1673; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, 192(%r8) 1674; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 224(%r8) 1675; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, 64(%r8) 1676; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 96(%r8) 1677; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, 128(%r8) 1678; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, 160(%r8) 1679; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) 1680; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 32(%r8) 1681; AVX512DQ-FCP-NEXT: vzeroupper 1682; AVX512DQ-FCP-NEXT: retq 1683; 1684; AVX512BW-LABEL: store_i8_stride4_vf64: 1685; AVX512BW: # %bb.0: 1686; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1687; 
AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 1688; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 1689; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 1690; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] 1691; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] 1692; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55] 1693; AVX512BW-NEXT: vpunpckhbw 
{{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63] 1694; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27] 1695; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31] 1696; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27] 1697; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] 1698; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1699; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, 
%ymm5 1700; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm6 1701; AVX512BW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 1702; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 1703; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm8 1704; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 1705; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 1706; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1707; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 1708; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 1709; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] 1710; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 1711; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] 1712; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] 1713; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1714; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) 1715; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) 1716; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) 1717; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) 1718; AVX512BW-NEXT: vzeroupper 1719; AVX512BW-NEXT: retq 1720; 1721; AVX512BW-FCP-LABEL: store_i8_stride4_vf64: 1722; AVX512BW-FCP: # %bb.0: 1723; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1724; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 1725; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 1726; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 1727; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] 
1728; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] 1729; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55] 1730; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63] 1731; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} zmm3 = 
zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27] 1732; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31] 1733; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27] 1734; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] 1735; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1736; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1737; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm1, %xmm6 1738; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm7 1739; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 1740; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm8 1741; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 1742; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 1743; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1744; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 1745; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 1746; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = 
zmm3[2,3,6,7],zmm4[2,3,6,7] 1747; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 1748; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] 1749; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] 1750; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1751; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) 1752; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) 1753; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) 1754; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) 1755; AVX512BW-FCP-NEXT: vzeroupper 1756; AVX512BW-FCP-NEXT: retq 1757; 1758; AVX512DQ-BW-LABEL: store_i8_stride4_vf64: 1759; AVX512DQ-BW: # %bb.0: 1760; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 1761; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 1762; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 1763; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 1764; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] 1765; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] 1766; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55] 1767; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63] 1768; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} zmm3 = 
zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27] 1769; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31] 1770; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27] 1771; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] 1772; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1773; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1774; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, %xmm6 1775; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 1776; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 1777; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, %xmm8 1778; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 1779; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 1780; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1781; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 1782; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 1783; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] 1784; 
AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 1785; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] 1786; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] 1787; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1788; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r8) 1789; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%r8) 1790; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%r8) 1791; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) 1792; AVX512DQ-BW-NEXT: vzeroupper 1793; AVX512DQ-BW-NEXT: retq 1794; 1795; AVX512DQ-BW-FCP-LABEL: store_i8_stride4_vf64: 1796; AVX512DQ-BW-FCP: # %bb.0: 1797; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1798; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 1799; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 1800; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 1801; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] 1802; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} zmm0 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] 1803; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55] 1804; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63] 1805; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} zmm3 = 
zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27] 1806; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31] 1807; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27] 1808; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] 1809; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1810; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1811; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm1, %xmm6 1812; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm7 1813; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 1814; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm8 1815; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 1816; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 1817; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1818; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 1819; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 1820; AVX512DQ-BW-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] 1821; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 1822; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] 1823; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] 1824; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] 1825; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) 1826; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) 1827; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) 1828; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) 1829; AVX512DQ-BW-FCP-NEXT: vzeroupper 1830; AVX512DQ-BW-FCP-NEXT: retq 1831 %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 1832 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 1833 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 1834 %in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64 1835 %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 
127> 1836 %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 1837 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, 
i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> 1838 %interleaved.vec = shufflevector <256 x i8> %3, <256 x i8> poison, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 
207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255> 1839 store <256 x i8> %interleaved.vec, ptr %out.vec, align 64 1840 ret void 1841} 1842