; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
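;
; As a minimal illustrative sketch (an assumption added for context, not part of
; the generated checks; the function name and signature are hypothetical), a
; scalar loop of roughly this shape is what the LoopVectorizer turns into the
; concat + interleave shufflevector sequences tested below:
;
;   void store_stride2(const char *a, const char *b, char *out, int n) {
;     for (int i = 0; i < n; i++) {
;       out[2 * i + 0] = a[i];   // even output bytes come from the first input
;       out[2 * i + 1] = b[i];   // odd output bytes come from the second input
;     }
;   }
;
; Each test loads the two inputs, concatenates them with one shufflevector, and
; interleaves the result with a second shufflevector before a single wide store.
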
18define void @store_i8_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
19; SSE-LABEL: store_i8_stride2_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movdqa (%rdi), %xmm0
22; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
23; SSE-NEXT:    movd %xmm0, (%rdx)
24; SSE-NEXT:    retq
25;
26; AVX-LABEL: store_i8_stride2_vf2:
27; AVX:       # %bb.0:
28; AVX-NEXT:    vmovdqa (%rdi), %xmm0
29; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
30; AVX-NEXT:    vmovd %xmm0, (%rdx)
31; AVX-NEXT:    retq
32;
33; AVX2-LABEL: store_i8_stride2_vf2:
34; AVX2:       # %bb.0:
35; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
36; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
37; AVX2-NEXT:    vmovd %xmm0, (%rdx)
38; AVX2-NEXT:    retq
39;
40; AVX2-FP-LABEL: store_i8_stride2_vf2:
41; AVX2-FP:       # %bb.0:
42; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
43; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
44; AVX2-FP-NEXT:    vmovd %xmm0, (%rdx)
45; AVX2-FP-NEXT:    retq
46;
47; AVX2-FCP-LABEL: store_i8_stride2_vf2:
48; AVX2-FCP:       # %bb.0:
49; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
50; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
51; AVX2-FCP-NEXT:    vmovd %xmm0, (%rdx)
52; AVX2-FCP-NEXT:    retq
53;
54; AVX512-LABEL: store_i8_stride2_vf2:
55; AVX512:       # %bb.0:
56; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
57; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
58; AVX512-NEXT:    vmovd %xmm0, (%rdx)
59; AVX512-NEXT:    retq
60;
61; AVX512-FCP-LABEL: store_i8_stride2_vf2:
62; AVX512-FCP:       # %bb.0:
63; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
64; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
65; AVX512-FCP-NEXT:    vmovd %xmm0, (%rdx)
66; AVX512-FCP-NEXT:    retq
67;
68; AVX512DQ-LABEL: store_i8_stride2_vf2:
69; AVX512DQ:       # %bb.0:
70; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
71; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
72; AVX512DQ-NEXT:    vmovd %xmm0, (%rdx)
73; AVX512DQ-NEXT:    retq
74;
75; AVX512DQ-FCP-LABEL: store_i8_stride2_vf2:
76; AVX512DQ-FCP:       # %bb.0:
77; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
78; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
79; AVX512DQ-FCP-NEXT:    vmovd %xmm0, (%rdx)
80; AVX512DQ-FCP-NEXT:    retq
81;
82; AVX512BW-LABEL: store_i8_stride2_vf2:
83; AVX512BW:       # %bb.0:
84; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
85; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
86; AVX512BW-NEXT:    vmovd %xmm0, (%rdx)
87; AVX512BW-NEXT:    retq
88;
89; AVX512BW-FCP-LABEL: store_i8_stride2_vf2:
90; AVX512BW-FCP:       # %bb.0:
91; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
92; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
93; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%rdx)
94; AVX512BW-FCP-NEXT:    retq
95;
96; AVX512DQ-BW-LABEL: store_i8_stride2_vf2:
97; AVX512DQ-BW:       # %bb.0:
98; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
99; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
100; AVX512DQ-BW-NEXT:    vmovd %xmm0, (%rdx)
101; AVX512DQ-BW-NEXT:    retq
102;
103; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf2:
104; AVX512DQ-BW-FCP:       # %bb.0:
105; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
106; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
107; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%rdx)
108; AVX512DQ-BW-FCP-NEXT:    retq
109  %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
110  %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
111  %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
112  %interleaved.vec = shufflevector <4 x i8> %1, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
113  store <4 x i8> %interleaved.vec, ptr %out.vec, align 64
114  ret void
115}
116
117define void @store_i8_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
118; SSE-LABEL: store_i8_stride2_vf4:
119; SSE:       # %bb.0:
120; SSE-NEXT:    movdqa (%rdi), %xmm0
121; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
122; SSE-NEXT:    movq %xmm0, (%rdx)
123; SSE-NEXT:    retq
124;
125; AVX-LABEL: store_i8_stride2_vf4:
126; AVX:       # %bb.0:
127; AVX-NEXT:    vmovdqa (%rdi), %xmm0
128; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
129; AVX-NEXT:    vmovq %xmm0, (%rdx)
130; AVX-NEXT:    retq
131;
132; AVX2-LABEL: store_i8_stride2_vf4:
133; AVX2:       # %bb.0:
134; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
135; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
136; AVX2-NEXT:    vmovq %xmm0, (%rdx)
137; AVX2-NEXT:    retq
138;
139; AVX2-FP-LABEL: store_i8_stride2_vf4:
140; AVX2-FP:       # %bb.0:
141; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
142; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
143; AVX2-FP-NEXT:    vmovq %xmm0, (%rdx)
144; AVX2-FP-NEXT:    retq
145;
146; AVX2-FCP-LABEL: store_i8_stride2_vf4:
147; AVX2-FCP:       # %bb.0:
148; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
149; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
150; AVX2-FCP-NEXT:    vmovq %xmm0, (%rdx)
151; AVX2-FCP-NEXT:    retq
152;
153; AVX512-LABEL: store_i8_stride2_vf4:
154; AVX512:       # %bb.0:
155; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
156; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
157; AVX512-NEXT:    vmovq %xmm0, (%rdx)
158; AVX512-NEXT:    retq
159;
160; AVX512-FCP-LABEL: store_i8_stride2_vf4:
161; AVX512-FCP:       # %bb.0:
162; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
163; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
164; AVX512-FCP-NEXT:    vmovq %xmm0, (%rdx)
165; AVX512-FCP-NEXT:    retq
166;
167; AVX512DQ-LABEL: store_i8_stride2_vf4:
168; AVX512DQ:       # %bb.0:
169; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
170; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
171; AVX512DQ-NEXT:    vmovq %xmm0, (%rdx)
172; AVX512DQ-NEXT:    retq
173;
174; AVX512DQ-FCP-LABEL: store_i8_stride2_vf4:
175; AVX512DQ-FCP:       # %bb.0:
176; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
177; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
178; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rdx)
179; AVX512DQ-FCP-NEXT:    retq
180;
181; AVX512BW-LABEL: store_i8_stride2_vf4:
182; AVX512BW:       # %bb.0:
183; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
184; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
185; AVX512BW-NEXT:    vmovq %xmm0, (%rdx)
186; AVX512BW-NEXT:    retq
187;
188; AVX512BW-FCP-LABEL: store_i8_stride2_vf4:
189; AVX512BW-FCP:       # %bb.0:
190; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
191; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
192; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rdx)
193; AVX512BW-FCP-NEXT:    retq
194;
195; AVX512DQ-BW-LABEL: store_i8_stride2_vf4:
196; AVX512DQ-BW:       # %bb.0:
197; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
198; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
199; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rdx)
200; AVX512DQ-BW-NEXT:    retq
201;
202; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf4:
203; AVX512DQ-BW-FCP:       # %bb.0:
204; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
205; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
206; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rdx)
207; AVX512DQ-BW-FCP-NEXT:    retq
208  %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
209  %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
210  %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
211  %interleaved.vec = shufflevector <8 x i8> %1, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
212  store <8 x i8> %interleaved.vec, ptr %out.vec, align 64
213  ret void
214}
215
216define void @store_i8_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
217; SSE-LABEL: store_i8_stride2_vf8:
218; SSE:       # %bb.0:
219; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
220; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
221; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
222; SSE-NEXT:    movdqa %xmm1, (%rdx)
223; SSE-NEXT:    retq
224;
225; AVX-LABEL: store_i8_stride2_vf8:
226; AVX:       # %bb.0:
227; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
228; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
229; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
230; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
231; AVX-NEXT:    retq
232;
233; AVX2-LABEL: store_i8_stride2_vf8:
234; AVX2:       # %bb.0:
235; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
236; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
237; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
238; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
239; AVX2-NEXT:    retq
240;
241; AVX2-FP-LABEL: store_i8_stride2_vf8:
242; AVX2-FP:       # %bb.0:
243; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
244; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
245; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
246; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rdx)
247; AVX2-FP-NEXT:    retq
248;
249; AVX2-FCP-LABEL: store_i8_stride2_vf8:
250; AVX2-FCP:       # %bb.0:
251; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
252; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
253; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
254; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
255; AVX2-FCP-NEXT:    retq
256;
257; AVX512-LABEL: store_i8_stride2_vf8:
258; AVX512:       # %bb.0:
259; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
260; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
261; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
262; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
263; AVX512-NEXT:    retq
264;
265; AVX512-FCP-LABEL: store_i8_stride2_vf8:
266; AVX512-FCP:       # %bb.0:
267; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
268; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
269; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
270; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
271; AVX512-FCP-NEXT:    retq
272;
273; AVX512DQ-LABEL: store_i8_stride2_vf8:
274; AVX512DQ:       # %bb.0:
275; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
276; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
277; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
278; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
279; AVX512DQ-NEXT:    retq
280;
281; AVX512DQ-FCP-LABEL: store_i8_stride2_vf8:
282; AVX512DQ-FCP:       # %bb.0:
283; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
284; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
285; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
286; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
287; AVX512DQ-FCP-NEXT:    retq
288;
289; AVX512BW-LABEL: store_i8_stride2_vf8:
290; AVX512BW:       # %bb.0:
291; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
292; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
293; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
294; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
295; AVX512BW-NEXT:    retq
296;
297; AVX512BW-FCP-LABEL: store_i8_stride2_vf8:
298; AVX512BW-FCP:       # %bb.0:
299; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
300; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
301; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
302; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
303; AVX512BW-FCP-NEXT:    retq
304;
305; AVX512DQ-BW-LABEL: store_i8_stride2_vf8:
306; AVX512DQ-BW:       # %bb.0:
307; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
308; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
309; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
310; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rdx)
311; AVX512DQ-BW-NEXT:    retq
312;
313; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf8:
314; AVX512DQ-BW-FCP:       # %bb.0:
315; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
316; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
317; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
318; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
319; AVX512DQ-BW-FCP-NEXT:    retq
320  %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
321  %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
322  %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
323  %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
324  store <16 x i8> %interleaved.vec, ptr %out.vec, align 64
325  ret void
326}
327
328define void @store_i8_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
329; SSE-LABEL: store_i8_stride2_vf16:
330; SSE:       # %bb.0:
331; SSE-NEXT:    movdqa (%rdi), %xmm0
332; SSE-NEXT:    movdqa (%rsi), %xmm1
333; SSE-NEXT:    movdqa %xmm0, %xmm2
334; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
335; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
336; SSE-NEXT:    movdqa %xmm0, 16(%rdx)
337; SSE-NEXT:    movdqa %xmm2, (%rdx)
338; SSE-NEXT:    retq
339;
340; AVX-LABEL: store_i8_stride2_vf16:
341; AVX:       # %bb.0:
342; AVX-NEXT:    vmovdqa (%rdi), %xmm0
343; AVX-NEXT:    vmovdqa (%rsi), %xmm1
344; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
345; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
346; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
347; AVX-NEXT:    vmovdqa %xmm2, (%rdx)
348; AVX-NEXT:    retq
349;
350; AVX2-LABEL: store_i8_stride2_vf16:
351; AVX2:       # %bb.0:
352; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
353; AVX2-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
354; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
355; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
356; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
357; AVX2-NEXT:    vzeroupper
358; AVX2-NEXT:    retq
359;
360; AVX2-FP-LABEL: store_i8_stride2_vf16:
361; AVX2-FP:       # %bb.0:
362; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
363; AVX2-FP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
364; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
365; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
366; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rdx)
367; AVX2-FP-NEXT:    vzeroupper
368; AVX2-FP-NEXT:    retq
369;
370; AVX2-FCP-LABEL: store_i8_stride2_vf16:
371; AVX2-FCP:       # %bb.0:
372; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
373; AVX2-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
374; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
375; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
376; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
377; AVX2-FCP-NEXT:    vzeroupper
378; AVX2-FCP-NEXT:    retq
379;
380; AVX512-LABEL: store_i8_stride2_vf16:
381; AVX512:       # %bb.0:
382; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
383; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
384; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
385; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
386; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
387; AVX512-NEXT:    vzeroupper
388; AVX512-NEXT:    retq
389;
390; AVX512-FCP-LABEL: store_i8_stride2_vf16:
391; AVX512-FCP:       # %bb.0:
392; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
393; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
394; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
395; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
396; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
397; AVX512-FCP-NEXT:    vzeroupper
398; AVX512-FCP-NEXT:    retq
399;
400; AVX512DQ-LABEL: store_i8_stride2_vf16:
401; AVX512DQ:       # %bb.0:
402; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
403; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
404; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
405; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
406; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
407; AVX512DQ-NEXT:    vzeroupper
408; AVX512DQ-NEXT:    retq
409;
410; AVX512DQ-FCP-LABEL: store_i8_stride2_vf16:
411; AVX512DQ-FCP:       # %bb.0:
412; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
413; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
414; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
415; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
416; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
417; AVX512DQ-FCP-NEXT:    vzeroupper
418; AVX512DQ-FCP-NEXT:    retq
419;
420; AVX512BW-LABEL: store_i8_stride2_vf16:
421; AVX512BW:       # %bb.0:
422; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
423; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
424; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
425; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
426; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
427; AVX512BW-NEXT:    vzeroupper
428; AVX512BW-NEXT:    retq
429;
430; AVX512BW-FCP-LABEL: store_i8_stride2_vf16:
431; AVX512BW-FCP:       # %bb.0:
432; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
433; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
434; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
435; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
436; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
437; AVX512BW-FCP-NEXT:    vzeroupper
438; AVX512BW-FCP-NEXT:    retq
439;
440; AVX512DQ-BW-LABEL: store_i8_stride2_vf16:
441; AVX512DQ-BW:       # %bb.0:
442; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
443; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
444; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
445; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
446; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rdx)
447; AVX512DQ-BW-NEXT:    vzeroupper
448; AVX512DQ-BW-NEXT:    retq
449;
450; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf16:
451; AVX512DQ-BW-FCP:       # %bb.0:
452; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
453; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
454; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
455; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
456; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
457; AVX512DQ-BW-FCP-NEXT:    vzeroupper
458; AVX512DQ-BW-FCP-NEXT:    retq
459  %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
460  %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
461  %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
462  %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
463  store <32 x i8> %interleaved.vec, ptr %out.vec, align 64
464  ret void
465}
466
467define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
468; SSE-LABEL: store_i8_stride2_vf32:
469; SSE:       # %bb.0:
470; SSE-NEXT:    movdqa (%rdi), %xmm0
471; SSE-NEXT:    movdqa 16(%rdi), %xmm1
472; SSE-NEXT:    movdqa (%rsi), %xmm2
473; SSE-NEXT:    movdqa 16(%rsi), %xmm3
474; SSE-NEXT:    movdqa %xmm0, %xmm4
475; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
476; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
477; SSE-NEXT:    movdqa %xmm1, %xmm2
478; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
479; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
480; SSE-NEXT:    movdqa %xmm1, 32(%rdx)
481; SSE-NEXT:    movdqa %xmm2, 48(%rdx)
482; SSE-NEXT:    movdqa %xmm0, (%rdx)
483; SSE-NEXT:    movdqa %xmm4, 16(%rdx)
484; SSE-NEXT:    retq
485;
486; AVX-LABEL: store_i8_stride2_vf32:
487; AVX:       # %bb.0:
488; AVX-NEXT:    vmovdqa (%rsi), %xmm0
489; AVX-NEXT:    vmovdqa 16(%rsi), %xmm1
490; AVX-NEXT:    vmovdqa (%rdi), %xmm2
491; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
492; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
493; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
494; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
495; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
496; AVX-NEXT:    vmovdqa %xmm1, 48(%rdx)
497; AVX-NEXT:    vmovdqa %xmm2, 32(%rdx)
498; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
499; AVX-NEXT:    vmovdqa %xmm4, 16(%rdx)
500; AVX-NEXT:    retq
501;
502; AVX2-LABEL: store_i8_stride2_vf32:
503; AVX2:       # %bb.0:
504; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
505; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
506; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
507; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
508; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
509; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
510; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
511; AVX2-NEXT:    vmovdqa %ymm1, (%rdx)
512; AVX2-NEXT:    vzeroupper
513; AVX2-NEXT:    retq
514;
515; AVX2-FP-LABEL: store_i8_stride2_vf32:
516; AVX2-FP:       # %bb.0:
517; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
518; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
519; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
520; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
521; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
522; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
523; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rdx)
524; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rdx)
525; AVX2-FP-NEXT:    vzeroupper
526; AVX2-FP-NEXT:    retq
527;
528; AVX2-FCP-LABEL: store_i8_stride2_vf32:
529; AVX2-FCP:       # %bb.0:
530; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
531; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm1
532; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
533; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
534; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
535; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
536; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%rdx)
537; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
538; AVX2-FCP-NEXT:    vzeroupper
539; AVX2-FCP-NEXT:    retq
540;
541; AVX512-LABEL: store_i8_stride2_vf32:
542; AVX512:       # %bb.0:
543; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
544; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm1
545; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
546; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm3
547; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
548; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
549; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
550; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
551; AVX512-NEXT:    vmovdqa %xmm1, 32(%rdx)
552; AVX512-NEXT:    vmovdqa %xmm2, 48(%rdx)
553; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
554; AVX512-NEXT:    vmovdqa %xmm4, 16(%rdx)
555; AVX512-NEXT:    retq
556;
557; AVX512-FCP-LABEL: store_i8_stride2_vf32:
558; AVX512-FCP:       # %bb.0:
559; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
560; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm1
561; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm2
562; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
563; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
564; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
565; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
566; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
567; AVX512-FCP-NEXT:    vmovdqa %xmm1, 32(%rdx)
568; AVX512-FCP-NEXT:    vmovdqa %xmm2, 48(%rdx)
569; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
570; AVX512-FCP-NEXT:    vmovdqa %xmm4, 16(%rdx)
571; AVX512-FCP-NEXT:    retq
572;
573; AVX512DQ-LABEL: store_i8_stride2_vf32:
574; AVX512DQ:       # %bb.0:
575; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm0
576; AVX512DQ-NEXT:    vmovdqa 16(%rsi), %xmm1
577; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm2
578; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm3
579; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
580; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
581; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
582; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
583; AVX512DQ-NEXT:    vmovdqa %xmm1, 32(%rdx)
584; AVX512DQ-NEXT:    vmovdqa %xmm2, 48(%rdx)
585; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
586; AVX512DQ-NEXT:    vmovdqa %xmm4, 16(%rdx)
587; AVX512DQ-NEXT:    retq
588;
589; AVX512DQ-FCP-LABEL: store_i8_stride2_vf32:
590; AVX512DQ-FCP:       # %bb.0:
591; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm0
592; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rsi), %xmm1
593; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm2
594; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm3
595; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
596; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
597; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
598; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
599; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, 32(%rdx)
600; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, 48(%rdx)
601; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
602; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, 16(%rdx)
603; AVX512DQ-FCP-NEXT:    retq
604;
605; AVX512BW-LABEL: store_i8_stride2_vf32:
606; AVX512BW:       # %bb.0:
607; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
608; AVX512BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
609; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
610; AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
611; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
612; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
613; AVX512BW-NEXT:    vzeroupper
614; AVX512BW-NEXT:    retq
615;
616; AVX512BW-FCP-LABEL: store_i8_stride2_vf32:
617; AVX512BW-FCP:       # %bb.0:
618; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
619; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
620; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
621; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm1, %zmm0
622; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
623; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rdx)
624; AVX512BW-FCP-NEXT:    vzeroupper
625; AVX512BW-FCP-NEXT:    retq
626;
627; AVX512DQ-BW-LABEL: store_i8_stride2_vf32:
628; AVX512DQ-BW:       # %bb.0:
629; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm0
630; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
631; AVX512DQ-BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
632; AVX512DQ-BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
633; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
634; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
635; AVX512DQ-BW-NEXT:    vzeroupper
636; AVX512DQ-BW-NEXT:    retq
637;
638; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf32:
639; AVX512DQ-BW-FCP:       # %bb.0:
640; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
641; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
642; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
643; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm1, %zmm0
644; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
645; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rdx)
646; AVX512DQ-BW-FCP-NEXT:    vzeroupper
647; AVX512DQ-BW-FCP-NEXT:    retq
648  %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
649  %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64
650  %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
651  %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
652  store <64 x i8> %interleaved.vec, ptr %out.vec, align 64
653  ret void
654}
655
656define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
657; SSE-LABEL: store_i8_stride2_vf64:
658; SSE:       # %bb.0:
659; SSE-NEXT:    movdqa (%rdi), %xmm0
660; SSE-NEXT:    movdqa 16(%rdi), %xmm1
661; SSE-NEXT:    movdqa 32(%rdi), %xmm2
662; SSE-NEXT:    movdqa 48(%rdi), %xmm3
663; SSE-NEXT:    movdqa (%rsi), %xmm4
664; SSE-NEXT:    movdqa 16(%rsi), %xmm5
665; SSE-NEXT:    movdqa 32(%rsi), %xmm6
666; SSE-NEXT:    movdqa 48(%rsi), %xmm7
667; SSE-NEXT:    movdqa %xmm0, %xmm8
668; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
669; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
670; SSE-NEXT:    movdqa %xmm1, %xmm4
671; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
672; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
673; SSE-NEXT:    movdqa %xmm2, %xmm5
674; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
675; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
676; SSE-NEXT:    movdqa %xmm3, %xmm6
677; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
678; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
679; SSE-NEXT:    movdqa %xmm3, 96(%rdx)
680; SSE-NEXT:    movdqa %xmm6, 112(%rdx)
681; SSE-NEXT:    movdqa %xmm2, 64(%rdx)
682; SSE-NEXT:    movdqa %xmm5, 80(%rdx)
683; SSE-NEXT:    movdqa %xmm1, 32(%rdx)
684; SSE-NEXT:    movdqa %xmm4, 48(%rdx)
685; SSE-NEXT:    movdqa %xmm0, (%rdx)
686; SSE-NEXT:    movdqa %xmm8, 16(%rdx)
687; SSE-NEXT:    retq
688;
689; AVX-LABEL: store_i8_stride2_vf64:
690; AVX:       # %bb.0:
691; AVX-NEXT:    vmovdqa (%rsi), %xmm0
692; AVX-NEXT:    vmovdqa 16(%rsi), %xmm1
693; AVX-NEXT:    vmovdqa 32(%rsi), %xmm2
694; AVX-NEXT:    vmovdqa 48(%rsi), %xmm3
695; AVX-NEXT:    vmovdqa (%rdi), %xmm4
696; AVX-NEXT:    vmovdqa 16(%rdi), %xmm5
697; AVX-NEXT:    vmovdqa 32(%rdi), %xmm6
698; AVX-NEXT:    vmovdqa 48(%rdi), %xmm7
699; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
700; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
701; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
702; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
703; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
704; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
705; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
706; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
707; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
708; AVX-NEXT:    vmovdqa %xmm5, 16(%rdx)
709; AVX-NEXT:    vmovdqa %xmm1, 32(%rdx)
710; AVX-NEXT:    vmovdqa %xmm7, 48(%rdx)
711; AVX-NEXT:    vmovdqa %xmm3, 96(%rdx)
712; AVX-NEXT:    vmovdqa %xmm6, 112(%rdx)
713; AVX-NEXT:    vmovdqa %xmm2, 64(%rdx)
714; AVX-NEXT:    vmovdqa %xmm8, 80(%rdx)
715; AVX-NEXT:    retq
716;
717; AVX2-LABEL: store_i8_stride2_vf64:
718; AVX2:       # %bb.0:
719; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
720; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
721; AVX2-NEXT:    vmovdqa (%rsi), %ymm2
722; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm3
723; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
724; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
725; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
726; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
727; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
728; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
729; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
730; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
731; AVX2-NEXT:    vmovdqa %ymm1, 64(%rdx)
732; AVX2-NEXT:    vmovdqa %ymm3, 96(%rdx)
733; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
734; AVX2-NEXT:    vmovdqa %ymm2, 32(%rdx)
735; AVX2-NEXT:    vzeroupper
736; AVX2-NEXT:    retq
737;
738; AVX2-FP-LABEL: store_i8_stride2_vf64:
739; AVX2-FP:       # %bb.0:
740; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
741; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
742; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm2
743; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm3
744; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
745; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
746; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
747; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
748; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
749; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
750; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
751; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
752; AVX2-FP-NEXT:    vmovdqa %ymm1, 64(%rdx)
753; AVX2-FP-NEXT:    vmovdqa %ymm3, 96(%rdx)
754; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rdx)
755; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rdx)
756; AVX2-FP-NEXT:    vzeroupper
757; AVX2-FP-NEXT:    retq
758;
759; AVX2-FCP-LABEL: store_i8_stride2_vf64:
760; AVX2-FCP:       # %bb.0:
761; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
762; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
763; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm2
764; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
765; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
766; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
767; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
768; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
769; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
770; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
771; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
772; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
773; AVX2-FCP-NEXT:    vmovdqa %ymm1, 64(%rdx)
774; AVX2-FCP-NEXT:    vmovdqa %ymm3, 96(%rdx)
775; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
776; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rdx)
777; AVX2-FCP-NEXT:    vzeroupper
778; AVX2-FCP-NEXT:    retq
779;
780; AVX512-LABEL: store_i8_stride2_vf64:
781; AVX512:       # %bb.0:
782; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
783; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm1
784; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
785; AVX512-NEXT:    vmovdqa 48(%rsi), %xmm3
786; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
787; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm5
788; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm6
789; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm7
790; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
791; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
792; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
793; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
794; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
795; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
796; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
797; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
798; AVX512-NEXT:    vmovdqa %xmm3, 96(%rdx)
799; AVX512-NEXT:    vmovdqa %xmm6, 112(%rdx)
800; AVX512-NEXT:    vmovdqa %xmm2, 64(%rdx)
801; AVX512-NEXT:    vmovdqa %xmm5, 80(%rdx)
802; AVX512-NEXT:    vmovdqa %xmm1, 32(%rdx)
803; AVX512-NEXT:    vmovdqa %xmm4, 48(%rdx)
804; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
805; AVX512-NEXT:    vmovdqa %xmm8, 16(%rdx)
806; AVX512-NEXT:    retq
807;
808; AVX512-FCP-LABEL: store_i8_stride2_vf64:
809; AVX512-FCP:       # %bb.0:
810; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
811; AVX512-FCP-NEXT:    vmovdqa 16(%rsi), %xmm1
812; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
813; AVX512-FCP-NEXT:    vmovdqa 48(%rsi), %xmm3
814; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
815; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
816; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
817; AVX512-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
818; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
819; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
820; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
821; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
822; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
823; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
824; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
825; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
826; AVX512-FCP-NEXT:    vmovdqa %xmm3, 96(%rdx)
827; AVX512-FCP-NEXT:    vmovdqa %xmm6, 112(%rdx)
828; AVX512-FCP-NEXT:    vmovdqa %xmm2, 64(%rdx)
829; AVX512-FCP-NEXT:    vmovdqa %xmm5, 80(%rdx)
830; AVX512-FCP-NEXT:    vmovdqa %xmm1, 32(%rdx)
831; AVX512-FCP-NEXT:    vmovdqa %xmm4, 48(%rdx)
832; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
833; AVX512-FCP-NEXT:    vmovdqa %xmm8, 16(%rdx)
834; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i8_stride2_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX512DQ-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm6
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm7
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-NEXT:    vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf64:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
; AVX512DQ-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i8_stride2_vf64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512BW-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX512BW-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm6
; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm7
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf64:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512BW-FCP-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX512BW-FCP-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
; AVX512BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf64:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX512DQ-BW-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm6
; AVX512DQ-BW-NEXT:    vmovdqa 48(%rdi), %xmm7
; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf64:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rsi), %xmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 48(%rsi), %xmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 48(%rdi), %xmm7
; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
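; The IR below first concatenates the two <64 x i8> inputs and then interleaves
; their bytes pairwise (a0, b0, a1, b1, ...), producing the stride-2 layout that
; is stored as a single <128 x i8> value.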
  %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i8> %1, <128 x i8> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  store <128 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}