; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
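; For example, a scalar loop such as
;   for (i = 0; i < n; ++i) { out[2*i] = a[i]; out[2*i+1] = b[i]; }
; is vectorized into two wide loads, a concatenating shufflevector, an
; interleaving shufflevector, and a single wide store; each
; @store_i8_stride2_vfN function below checks that shape at a different width.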

define void @store_i8_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride2_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; SSE-NEXT: movd %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride2_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vmovd %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride2_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-NEXT: vmovd %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride2_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FP-NEXT: vmovd %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride2_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovd %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride2_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovd %xmm0, (%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovd %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovd %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512BW-NEXT: vmovd %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %interleaved.vec = shufflevector <4 x i8> %1, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride2_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride2_vf4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride2_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride2_vf4:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride2_vf4:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride2_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovq %xmm0, (%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i8> %1, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride2_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movdqa %xmm1, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride2_vf8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride2_vf8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride2_vf8:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride2_vf8:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride2_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride2_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa (%rsi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT: movdqa %xmm0, 16(%rdx)
; SSE-NEXT: movdqa %xmm2, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride2_vf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm2, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride2_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride2_vf16:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride2_vf16:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride2_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf16:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf16:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf16:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf16:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf16:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride2_vf32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm2
; SSE-NEXT: movdqa 16(%rsi), %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
; SSE-NEXT: movdqa %xmm2, 48(%rdx)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: movdqa %xmm4, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride2_vf32:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride2_vf32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride2_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride2_vf32:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride2_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf32:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf32:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf32:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf32:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512DQ-BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  store <64 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride2_vf64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movdqa (%rsi), %xmm4
; SSE-NEXT: movdqa 16(%rsi), %xmm5
; SSE-NEXT: movdqa 32(%rsi), %xmm6
; SSE-NEXT: movdqa 48(%rsi), %xmm7
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; SSE-NEXT: movdqa %xmm3, 96(%rdx)
; SSE-NEXT: movdqa %xmm6, 112(%rdx)
; SSE-NEXT: movdqa %xmm2, 64(%rdx)
; SSE-NEXT: movdqa %xmm5, 80(%rdx)
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
; SSE-NEXT: movdqa %xmm4, 48(%rdx)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: movdqa %xmm8, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i8_stride2_vf64:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX-NEXT: vmovdqa (%rdi), %xmm4
; AVX-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm5, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm7, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride2_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-NEXT: vmovdqa %ymm3, 96(%rdx)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride2_vf64:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride2_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride2_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride2_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm4
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512BW-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 =
xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] 964; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 965; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 966; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 967; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] 968; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 969; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 970; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 971; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 972; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 973; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 974; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) 975; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) 976; AVX512DQ-BW-NEXT: vzeroupper 977; AVX512DQ-BW-NEXT: retq 978; 979; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf64: 980; AVX512DQ-BW-FCP: # %bb.0: 981; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 982; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 983; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 984; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rsi), %xmm3 985; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 986; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 987; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 988; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 989; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] 990; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 991; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 992; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] 993; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 994; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 995; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 996; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] 997; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 998; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 999; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 1000; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 1001; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 1002; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1003; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) 1004; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx) 1005; AVX512DQ-BW-FCP-NEXT: vzeroupper 1006; AVX512DQ-BW-FCP-NEXT: retq 1007 %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 1008 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 1009 %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 1010 %interleaved.vec = shufflevector <128 x i8> %1, <128 x i8> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> 1011 store <128 x i8> %interleaved.vec, ptr %out.vec, align 64 1012 ret void 1013} 1014