1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9
12
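; Every test below follows the same shape: load two <64 x i8> vectors, add
; them byte-wise, truncate the sum to its low N bytes, zero-extend those
; elements in-register by interleaving them with zero elements at the named
; factor, pad the result back out to 64 bytes (undef tail), add an output
; bias vector and store. In the vecNN_vXiY_to_vZiW_factorF names, NN is the
; bit width of the truncated vector, vXiY its element type, vZiW the widened
; element type produced by the zero extension, and F = W/Y is the factor.
; Each RUN line above compiles this IR for a different subtarget and checks
; it against the matching FileCheck prefix set.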
13define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
14; SSE2-LABEL: vec16_v2i8_to_v1i16_factor2:
15; SSE2:       # %bb.0:
16; SSE2-NEXT:    movdqa (%rdi), %xmm0
17; SSE2-NEXT:    paddb (%rsi), %xmm0
18; SSE2-NEXT:    pxor %xmm1, %xmm1
19; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
20; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
21; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
22; SSE2-NEXT:    paddb (%rdx), %xmm0
23; SSE2-NEXT:    movdqa %xmm0, (%rcx)
24; SSE2-NEXT:    retq
25;
26; SSE42-LABEL: vec16_v2i8_to_v1i16_factor2:
27; SSE42:       # %bb.0:
28; SSE42-NEXT:    movdqa (%rdi), %xmm0
29; SSE42-NEXT:    paddb (%rsi), %xmm0
30; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
31; SSE42-NEXT:    paddb (%rdx), %xmm0
32; SSE42-NEXT:    movdqa %xmm0, (%rcx)
33; SSE42-NEXT:    retq
34;
35; AVX-LABEL: vec16_v2i8_to_v1i16_factor2:
36; AVX:       # %bb.0:
37; AVX-NEXT:    vmovdqa (%rdi), %xmm0
38; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
39; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
40; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
41; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
42; AVX-NEXT:    retq
43;
44; AVX2-LABEL: vec16_v2i8_to_v1i16_factor2:
45; AVX2:       # %bb.0:
46; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
47; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
48; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
49; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
50; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
51; AVX2-NEXT:    vzeroupper
52; AVX2-NEXT:    retq
53;
54; AVX512F-LABEL: vec16_v2i8_to_v1i16_factor2:
55; AVX512F:       # %bb.0:
56; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
57; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
58; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
59; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
60; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
61; AVX512F-NEXT:    vzeroupper
62; AVX512F-NEXT:    retq
63;
64; AVX512BW-LABEL: vec16_v2i8_to_v1i16_factor2:
65; AVX512BW:       # %bb.0:
66; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
67; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
68; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
69; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
70; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
71; AVX512BW-NEXT:    vzeroupper
72; AVX512BW-NEXT:    retq
73  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
74  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
75  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
76  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <2 x i32> <i32 0, i32 1>
77  %zextd.vec = shufflevector <2 x i8> %in.vec.trunc, <2 x i8> zeroinitializer, <2 x i32> <i32 0, i32 3>
78  %out.bytevec.padded = shufflevector <2 x i8> %zextd.vec, <2 x i8> poison, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
79  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
80  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
81  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
82  ret void
83}
84
85define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
86; SSE2-LABEL: vec32_v4i8_to_v2i16_factor2:
87; SSE2:       # %bb.0:
88; SSE2-NEXT:    movdqa (%rdi), %xmm0
89; SSE2-NEXT:    paddb (%rsi), %xmm0
90; SSE2-NEXT:    pxor %xmm1, %xmm1
91; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
92; SSE2-NEXT:    paddb (%rdx), %xmm0
93; SSE2-NEXT:    movdqa %xmm0, (%rcx)
94; SSE2-NEXT:    retq
95;
96; SSE42-LABEL: vec32_v4i8_to_v2i16_factor2:
97; SSE42:       # %bb.0:
98; SSE42-NEXT:    movdqa (%rdi), %xmm0
99; SSE42-NEXT:    paddb (%rsi), %xmm0
100; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
101; SSE42-NEXT:    paddb (%rdx), %xmm0
102; SSE42-NEXT:    movdqa %xmm0, (%rcx)
103; SSE42-NEXT:    retq
104;
105; AVX-LABEL: vec32_v4i8_to_v2i16_factor2:
106; AVX:       # %bb.0:
107; AVX-NEXT:    vmovdqa (%rdi), %xmm0
108; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
109; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
110; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
111; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
112; AVX-NEXT:    retq
113;
114; AVX2-LABEL: vec32_v4i8_to_v2i16_factor2:
115; AVX2:       # %bb.0:
116; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
117; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
118; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
119; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
120; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
121; AVX2-NEXT:    vzeroupper
122; AVX2-NEXT:    retq
123;
124; AVX512F-LABEL: vec32_v4i8_to_v2i16_factor2:
125; AVX512F:       # %bb.0:
126; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
127; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
128; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
129; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
130; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
131; AVX512F-NEXT:    vzeroupper
132; AVX512F-NEXT:    retq
133;
134; AVX512BW-LABEL: vec32_v4i8_to_v2i16_factor2:
135; AVX512BW:       # %bb.0:
136; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
137; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
138; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
139; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
140; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
141; AVX512BW-NEXT:    vzeroupper
142; AVX512BW-NEXT:    retq
143  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
144  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
145  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
146  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
147  %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
148  %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
149  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
150  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
151  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
152  ret void
153}
154
155define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
156; SSE2-LABEL: vec32_v4i8_to_v1i32_factor4:
157; SSE2:       # %bb.0:
158; SSE2-NEXT:    movdqa (%rdi), %xmm0
159; SSE2-NEXT:    paddb (%rsi), %xmm0
160; SSE2-NEXT:    pxor %xmm1, %xmm1
161; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
162; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
163; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
164; SSE2-NEXT:    paddb (%rdx), %xmm0
165; SSE2-NEXT:    movdqa %xmm0, (%rcx)
166; SSE2-NEXT:    retq
167;
168; SSE42-LABEL: vec32_v4i8_to_v1i32_factor4:
169; SSE42:       # %bb.0:
170; SSE42-NEXT:    movdqa (%rdi), %xmm0
171; SSE42-NEXT:    paddb (%rsi), %xmm0
172; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
173; SSE42-NEXT:    paddb (%rdx), %xmm0
174; SSE42-NEXT:    movdqa %xmm0, (%rcx)
175; SSE42-NEXT:    retq
176;
177; AVX-LABEL: vec32_v4i8_to_v1i32_factor4:
178; AVX:       # %bb.0:
179; AVX-NEXT:    vmovdqa (%rdi), %xmm0
180; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
181; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
182; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
183; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
184; AVX-NEXT:    retq
185;
186; AVX2-LABEL: vec32_v4i8_to_v1i32_factor4:
187; AVX2:       # %bb.0:
188; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
189; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
190; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
191; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
192; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
193; AVX2-NEXT:    vzeroupper
194; AVX2-NEXT:    retq
195;
196; AVX512F-LABEL: vec32_v4i8_to_v1i32_factor4:
197; AVX512F:       # %bb.0:
198; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
199; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
200; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
201; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
202; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
203; AVX512F-NEXT:    vzeroupper
204; AVX512F-NEXT:    retq
205;
206; AVX512BW-LABEL: vec32_v4i8_to_v1i32_factor4:
207; AVX512BW:       # %bb.0:
208; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
209; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
210; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
211; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
212; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
213; AVX512BW-NEXT:    vzeroupper
214; AVX512BW-NEXT:    retq
215  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
216  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
217  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
218  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
219  %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
220  %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
221  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
222  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
223  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
224  ret void
225}
226
227define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
228; SSE2-LABEL: vec32_v2i16_to_v1i32_factor2:
229; SSE2:       # %bb.0:
230; SSE2-NEXT:    movdqa (%rdi), %xmm0
231; SSE2-NEXT:    paddb (%rsi), %xmm0
232; SSE2-NEXT:    pxor %xmm1, %xmm1
233; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
234; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
235; SSE2-NEXT:    paddb (%rdx), %xmm0
236; SSE2-NEXT:    movdqa %xmm0, (%rcx)
237; SSE2-NEXT:    retq
238;
239; SSE42-LABEL: vec32_v2i16_to_v1i32_factor2:
240; SSE42:       # %bb.0:
241; SSE42-NEXT:    movdqa (%rdi), %xmm0
242; SSE42-NEXT:    paddb (%rsi), %xmm0
243; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
244; SSE42-NEXT:    paddb (%rdx), %xmm0
245; SSE42-NEXT:    movdqa %xmm0, (%rcx)
246; SSE42-NEXT:    retq
247;
248; AVX-LABEL: vec32_v2i16_to_v1i32_factor2:
249; AVX:       # %bb.0:
250; AVX-NEXT:    vmovdqa (%rdi), %xmm0
251; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
252; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
253; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
254; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
255; AVX-NEXT:    retq
256;
257; AVX2-LABEL: vec32_v2i16_to_v1i32_factor2:
258; AVX2:       # %bb.0:
259; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
260; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
261; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
262; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
263; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
264; AVX2-NEXT:    vzeroupper
265; AVX2-NEXT:    retq
266;
267; AVX512F-LABEL: vec32_v2i16_to_v1i32_factor2:
268; AVX512F:       # %bb.0:
269; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
270; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
271; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
272; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
273; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
274; AVX512F-NEXT:    vzeroupper
275; AVX512F-NEXT:    retq
276;
277; AVX512BW-LABEL: vec32_v2i16_to_v1i32_factor2:
278; AVX512BW:       # %bb.0:
279; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
280; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
281; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
282; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
283; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
284; AVX512BW-NEXT:    vzeroupper
285; AVX512BW-NEXT:    retq
286  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
287  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
288  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
289  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
290  %in.vec.cast = bitcast <4 x i8> %in.vec.trunc to <2 x i16>
291  %zextd.vec = shufflevector <2 x i16> %in.vec.cast, <2 x i16> zeroinitializer, <2 x i32> <i32 0, i32 3>
292  %out.bytevec = bitcast <2 x i16> %zextd.vec to <4 x i8>
293  %out.bytevec.padded = shufflevector <4 x i8> %out.bytevec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
294  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
295  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
296  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
297  ret void
298}
299
300define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
301; SSE2-LABEL: vec64_v8i8_to_v4i16_factor2:
302; SSE2:       # %bb.0:
303; SSE2-NEXT:    movdqa (%rdi), %xmm0
304; SSE2-NEXT:    paddb (%rsi), %xmm0
305; SSE2-NEXT:    pxor %xmm1, %xmm1
306; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
307; SSE2-NEXT:    paddb (%rdx), %xmm0
308; SSE2-NEXT:    movdqa %xmm0, (%rcx)
309; SSE2-NEXT:    retq
310;
311; SSE42-LABEL: vec64_v8i8_to_v4i16_factor2:
312; SSE42:       # %bb.0:
313; SSE42-NEXT:    movdqa (%rdi), %xmm0
314; SSE42-NEXT:    paddb (%rsi), %xmm0
315; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
316; SSE42-NEXT:    paddb (%rdx), %xmm0
317; SSE42-NEXT:    movdqa %xmm0, (%rcx)
318; SSE42-NEXT:    retq
319;
320; AVX-LABEL: vec64_v8i8_to_v4i16_factor2:
321; AVX:       # %bb.0:
322; AVX-NEXT:    vmovdqa (%rdi), %xmm0
323; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
324; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
325; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
326; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
327; AVX-NEXT:    retq
328;
329; AVX2-LABEL: vec64_v8i8_to_v4i16_factor2:
330; AVX2:       # %bb.0:
331; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
332; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
333; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
334; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
335; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
336; AVX2-NEXT:    vzeroupper
337; AVX2-NEXT:    retq
338;
339; AVX512F-LABEL: vec64_v8i8_to_v4i16_factor2:
340; AVX512F:       # %bb.0:
341; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
342; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
343; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
344; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
345; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
346; AVX512F-NEXT:    vzeroupper
347; AVX512F-NEXT:    retq
348;
349; AVX512BW-LABEL: vec64_v8i8_to_v4i16_factor2:
350; AVX512BW:       # %bb.0:
351; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
352; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
353; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
354; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
355; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
356; AVX512BW-NEXT:    vzeroupper
357; AVX512BW-NEXT:    retq
358  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
359  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
360  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
361  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
362  %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
363  %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
364  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
365  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
366  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
367  ret void
368}
369
370define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
371; SSE2-LABEL: vec64_v8i8_to_v2i32_factor4:
372; SSE2:       # %bb.0:
373; SSE2-NEXT:    movdqa (%rdi), %xmm0
374; SSE2-NEXT:    paddb (%rsi), %xmm0
375; SSE2-NEXT:    pxor %xmm1, %xmm1
376; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
377; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
378; SSE2-NEXT:    paddb (%rdx), %xmm0
379; SSE2-NEXT:    movdqa %xmm0, (%rcx)
380; SSE2-NEXT:    retq
381;
382; SSE42-LABEL: vec64_v8i8_to_v2i32_factor4:
383; SSE42:       # %bb.0:
384; SSE42-NEXT:    movdqa (%rdi), %xmm0
385; SSE42-NEXT:    paddb (%rsi), %xmm0
386; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
387; SSE42-NEXT:    paddb (%rdx), %xmm0
388; SSE42-NEXT:    movdqa %xmm0, (%rcx)
389; SSE42-NEXT:    retq
390;
391; AVX-LABEL: vec64_v8i8_to_v2i32_factor4:
392; AVX:       # %bb.0:
393; AVX-NEXT:    vmovdqa (%rdi), %xmm0
394; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
395; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
396; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
397; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
398; AVX-NEXT:    retq
399;
400; AVX2-LABEL: vec64_v8i8_to_v2i32_factor4:
401; AVX2:       # %bb.0:
402; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
403; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
404; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
405; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
406; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
407; AVX2-NEXT:    vzeroupper
408; AVX2-NEXT:    retq
409;
410; AVX512F-LABEL: vec64_v8i8_to_v2i32_factor4:
411; AVX512F:       # %bb.0:
412; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
413; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
414; AVX512F-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
415; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
416; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
417; AVX512F-NEXT:    vzeroupper
418; AVX512F-NEXT:    retq
419;
420; AVX512BW-LABEL: vec64_v8i8_to_v2i32_factor4:
421; AVX512BW:       # %bb.0:
422; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
423; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
424; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
425; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
426; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
427; AVX512BW-NEXT:    vzeroupper
428; AVX512BW-NEXT:    retq
429  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
430  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
431  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
432  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
433  %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
434  %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
435  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
436  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
437  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
438  ret void
439}
440
441define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
442; SSE2-LABEL: vec64_v8i8_to_v1i64_factor8:
443; SSE2:       # %bb.0:
444; SSE2-NEXT:    movdqa (%rdi), %xmm0
445; SSE2-NEXT:    paddb (%rsi), %xmm0
446; SSE2-NEXT:    pxor %xmm1, %xmm1
447; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
448; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
449; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
450; SSE2-NEXT:    paddb (%rdx), %xmm0
451; SSE2-NEXT:    movdqa %xmm0, (%rcx)
452; SSE2-NEXT:    retq
453;
454; SSE42-LABEL: vec64_v8i8_to_v1i64_factor8:
455; SSE42:       # %bb.0:
456; SSE42-NEXT:    movdqa (%rdi), %xmm0
457; SSE42-NEXT:    paddb (%rsi), %xmm0
458; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
459; SSE42-NEXT:    paddb (%rdx), %xmm0
460; SSE42-NEXT:    movdqa %xmm0, (%rcx)
461; SSE42-NEXT:    retq
462;
463; AVX-LABEL: vec64_v8i8_to_v1i64_factor8:
464; AVX:       # %bb.0:
465; AVX-NEXT:    vmovdqa (%rdi), %xmm0
466; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
467; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
468; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
469; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
470; AVX-NEXT:    retq
471;
472; AVX2-LABEL: vec64_v8i8_to_v1i64_factor8:
473; AVX2:       # %bb.0:
474; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
475; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
476; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
477; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
478; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
479; AVX2-NEXT:    vzeroupper
480; AVX2-NEXT:    retq
481;
482; AVX512F-LABEL: vec64_v8i8_to_v1i64_factor8:
483; AVX512F:       # %bb.0:
484; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
485; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
486; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
487; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
488; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
489; AVX512F-NEXT:    vzeroupper
490; AVX512F-NEXT:    retq
491;
492; AVX512BW-LABEL: vec64_v8i8_to_v1i64_factor8:
493; AVX512BW:       # %bb.0:
494; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
495; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
496; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
497; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
498; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
499; AVX512BW-NEXT:    vzeroupper
500; AVX512BW-NEXT:    retq
501  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
502  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
503  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
504  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
505  %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
506  %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
507  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
508  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
509  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
510  ret void
511}
512
513define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
514; SSE2-LABEL: vec64_v4i16_to_v2i32_factor2:
515; SSE2:       # %bb.0:
516; SSE2-NEXT:    movdqa (%rdi), %xmm0
517; SSE2-NEXT:    paddb (%rsi), %xmm0
518; SSE2-NEXT:    pxor %xmm1, %xmm1
519; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
520; SSE2-NEXT:    paddb (%rdx), %xmm0
521; SSE2-NEXT:    movdqa %xmm0, (%rcx)
522; SSE2-NEXT:    retq
523;
524; SSE42-LABEL: vec64_v4i16_to_v2i32_factor2:
525; SSE42:       # %bb.0:
526; SSE42-NEXT:    movdqa (%rdi), %xmm0
527; SSE42-NEXT:    paddb (%rsi), %xmm0
528; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
529; SSE42-NEXT:    paddb (%rdx), %xmm0
530; SSE42-NEXT:    movdqa %xmm0, (%rcx)
531; SSE42-NEXT:    retq
532;
533; AVX-LABEL: vec64_v4i16_to_v2i32_factor2:
534; AVX:       # %bb.0:
535; AVX-NEXT:    vmovdqa (%rdi), %xmm0
536; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
537; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
538; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
539; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
540; AVX-NEXT:    retq
541;
542; AVX2-LABEL: vec64_v4i16_to_v2i32_factor2:
543; AVX2:       # %bb.0:
544; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
545; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
546; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
547; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
548; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
549; AVX2-NEXT:    vzeroupper
550; AVX2-NEXT:    retq
551;
552; AVX512F-LABEL: vec64_v4i16_to_v2i32_factor2:
553; AVX512F:       # %bb.0:
554; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
555; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
556; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
557; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
558; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
559; AVX512F-NEXT:    vzeroupper
560; AVX512F-NEXT:    retq
561;
562; AVX512BW-LABEL: vec64_v4i16_to_v2i32_factor2:
563; AVX512BW:       # %bb.0:
564; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
565; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
566; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
567; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
568; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
569; AVX512BW-NEXT:    vzeroupper
570; AVX512BW-NEXT:    retq
571  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
572  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
573  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
574  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
575  %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16>
576  %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
577  %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8>
578  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
579  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
580  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
581  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
582  ret void
583}
584
585define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
586; SSE2-LABEL: vec64_v4i16_to_v1i64_factor4:
587; SSE2:       # %bb.0:
588; SSE2-NEXT:    movdqa (%rdi), %xmm0
589; SSE2-NEXT:    paddb (%rsi), %xmm0
590; SSE2-NEXT:    pxor %xmm1, %xmm1
591; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
592; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
593; SSE2-NEXT:    paddb (%rdx), %xmm0
594; SSE2-NEXT:    movdqa %xmm0, (%rcx)
595; SSE2-NEXT:    retq
596;
597; SSE42-LABEL: vec64_v4i16_to_v1i64_factor4:
598; SSE42:       # %bb.0:
599; SSE42-NEXT:    movdqa (%rdi), %xmm0
600; SSE42-NEXT:    paddb (%rsi), %xmm0
601; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
602; SSE42-NEXT:    paddb (%rdx), %xmm0
603; SSE42-NEXT:    movdqa %xmm0, (%rcx)
604; SSE42-NEXT:    retq
605;
606; AVX-LABEL: vec64_v4i16_to_v1i64_factor4:
607; AVX:       # %bb.0:
608; AVX-NEXT:    vmovdqa (%rdi), %xmm0
609; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
610; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
611; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
612; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
613; AVX-NEXT:    retq
614;
615; AVX2-LABEL: vec64_v4i16_to_v1i64_factor4:
616; AVX2:       # %bb.0:
617; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
618; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
619; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
620; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
621; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
622; AVX2-NEXT:    vzeroupper
623; AVX2-NEXT:    retq
624;
625; AVX512F-LABEL: vec64_v4i16_to_v1i64_factor4:
626; AVX512F:       # %bb.0:
627; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
628; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
629; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
630; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
631; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
632; AVX512F-NEXT:    vzeroupper
633; AVX512F-NEXT:    retq
634;
635; AVX512BW-LABEL: vec64_v4i16_to_v1i64_factor4:
636; AVX512BW:       # %bb.0:
637; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
638; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
639; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
640; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
641; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
642; AVX512BW-NEXT:    vzeroupper
643; AVX512BW-NEXT:    retq
644  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
645  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
646  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
647  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
648  %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16>
649  %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
650  %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8>
651  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
652  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
653  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
654  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
655  ret void
656}
657
658define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
659; SSE2-LABEL: vec64_v2i32_to_v1i64_factor2:
660; SSE2:       # %bb.0:
661; SSE2-NEXT:    movdqa (%rdi), %xmm0
662; SSE2-NEXT:    paddb (%rsi), %xmm0
663; SSE2-NEXT:    pxor %xmm1, %xmm1
664; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
665; SSE2-NEXT:    paddb (%rdx), %xmm0
666; SSE2-NEXT:    movdqa %xmm0, (%rcx)
667; SSE2-NEXT:    retq
668;
669; SSE42-LABEL: vec64_v2i32_to_v1i64_factor2:
670; SSE42:       # %bb.0:
671; SSE42-NEXT:    movdqa (%rdi), %xmm0
672; SSE42-NEXT:    paddb (%rsi), %xmm0
673; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
674; SSE42-NEXT:    paddb (%rdx), %xmm0
675; SSE42-NEXT:    movdqa %xmm0, (%rcx)
676; SSE42-NEXT:    retq
677;
678; AVX-LABEL: vec64_v2i32_to_v1i64_factor2:
679; AVX:       # %bb.0:
680; AVX-NEXT:    vmovdqa (%rdi), %xmm0
681; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
682; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
683; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
684; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
685; AVX-NEXT:    retq
686;
687; AVX2-LABEL: vec64_v2i32_to_v1i64_factor2:
688; AVX2:       # %bb.0:
689; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
690; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
691; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
692; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
693; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
694; AVX2-NEXT:    vzeroupper
695; AVX2-NEXT:    retq
696;
697; AVX512F-LABEL: vec64_v2i32_to_v1i64_factor2:
698; AVX512F:       # %bb.0:
699; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
700; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
701; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
702; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
703; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
704; AVX512F-NEXT:    vzeroupper
705; AVX512F-NEXT:    retq
706;
707; AVX512BW-LABEL: vec64_v2i32_to_v1i64_factor2:
708; AVX512BW:       # %bb.0:
709; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
710; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
711; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
712; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
713; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
714; AVX512BW-NEXT:    vzeroupper
715; AVX512BW-NEXT:    retq
716  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
717  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
718  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
719  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
720  %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <2 x i32>
721  %zextd.vec = shufflevector <2 x i32> %in.vec.cast, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 3>
722  %out.bytevec = bitcast <2 x i32> %zextd.vec to <8 x i8>
723  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
724  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
725  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
726  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
727  ret void
728}
729
730define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
731; SSE2-LABEL: vec128_v16i8_to_v8i16_factor2:
732; SSE2:       # %bb.0:
733; SSE2-NEXT:    movdqa (%rdi), %xmm0
734; SSE2-NEXT:    paddb (%rsi), %xmm0
735; SSE2-NEXT:    pxor %xmm1, %xmm1
736; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
737; SSE2-NEXT:    paddb (%rdx), %xmm0
738; SSE2-NEXT:    movdqa %xmm0, (%rcx)
739; SSE2-NEXT:    retq
740;
741; SSE42-LABEL: vec128_v16i8_to_v8i16_factor2:
742; SSE42:       # %bb.0:
743; SSE42-NEXT:    movdqa (%rdi), %xmm0
744; SSE42-NEXT:    paddb (%rsi), %xmm0
745; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
746; SSE42-NEXT:    paddb (%rdx), %xmm0
747; SSE42-NEXT:    movdqa %xmm0, (%rcx)
748; SSE42-NEXT:    retq
749;
750; AVX-LABEL: vec128_v16i8_to_v8i16_factor2:
751; AVX:       # %bb.0:
752; AVX-NEXT:    vmovdqa (%rdi), %xmm0
753; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
754; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
755; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
756; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
757; AVX-NEXT:    retq
758;
759; AVX2-LABEL: vec128_v16i8_to_v8i16_factor2:
760; AVX2:       # %bb.0:
761; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
762; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
763; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
764; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
765; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
766; AVX2-NEXT:    vzeroupper
767; AVX2-NEXT:    retq
768;
769; AVX512F-LABEL: vec128_v16i8_to_v8i16_factor2:
770; AVX512F:       # %bb.0:
771; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
772; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
773; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
774; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
775; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
776; AVX512F-NEXT:    vzeroupper
777; AVX512F-NEXT:    retq
778;
779; AVX512BW-LABEL: vec128_v16i8_to_v8i16_factor2:
780; AVX512BW:       # %bb.0:
781; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
782; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
783; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
784; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
785; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
786; AVX512BW-NEXT:    vzeroupper
787; AVX512BW-NEXT:    retq
788  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
789  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
790  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
791  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
792  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
793  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
794  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
795  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
796  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
797  ret void
798}
799
800define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
801; SSE2-LABEL: vec128_v16i8_to_v4i32_factor4:
802; SSE2:       # %bb.0:
803; SSE2-NEXT:    movdqa (%rdi), %xmm0
804; SSE2-NEXT:    paddb (%rsi), %xmm0
805; SSE2-NEXT:    pxor %xmm1, %xmm1
806; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
807; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
808; SSE2-NEXT:    paddb (%rdx), %xmm0
809; SSE2-NEXT:    movdqa %xmm0, (%rcx)
810; SSE2-NEXT:    retq
811;
812; SSE42-LABEL: vec128_v16i8_to_v4i32_factor4:
813; SSE42:       # %bb.0:
814; SSE42-NEXT:    movdqa (%rdi), %xmm0
815; SSE42-NEXT:    paddb (%rsi), %xmm0
816; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
817; SSE42-NEXT:    paddb (%rdx), %xmm0
818; SSE42-NEXT:    movdqa %xmm0, (%rcx)
819; SSE42-NEXT:    retq
820;
821; AVX-LABEL: vec128_v16i8_to_v4i32_factor4:
822; AVX:       # %bb.0:
823; AVX-NEXT:    vmovdqa (%rdi), %xmm0
824; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
825; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
826; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
827; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
828; AVX-NEXT:    retq
829;
830; AVX2-LABEL: vec128_v16i8_to_v4i32_factor4:
831; AVX2:       # %bb.0:
832; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
833; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
834; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
835; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
836; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
837; AVX2-NEXT:    vzeroupper
838; AVX2-NEXT:    retq
839;
840; AVX512F-LABEL: vec128_v16i8_to_v4i32_factor4:
841; AVX512F:       # %bb.0:
842; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
843; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
844; AVX512F-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
845; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
846; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
847; AVX512F-NEXT:    vzeroupper
848; AVX512F-NEXT:    retq
849;
850; AVX512BW-LABEL: vec128_v16i8_to_v4i32_factor4:
851; AVX512BW:       # %bb.0:
852; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
853; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
854; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
855; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
856; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
857; AVX512BW-NEXT:    vzeroupper
858; AVX512BW-NEXT:    retq
859  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
860  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
861  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
862  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
863  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
864  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
865  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
866  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
867  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
868  ret void
869}
870
871define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
872; SSE2-LABEL: vec128_v16i8_to_v2i64_factor8:
873; SSE2:       # %bb.0:
874; SSE2-NEXT:    movdqa (%rdi), %xmm0
875; SSE2-NEXT:    paddb (%rsi), %xmm0
876; SSE2-NEXT:    pxor %xmm1, %xmm1
877; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
878; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
879; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
880; SSE2-NEXT:    paddb (%rdx), %xmm0
881; SSE2-NEXT:    movdqa %xmm0, (%rcx)
882; SSE2-NEXT:    retq
883;
884; SSE42-LABEL: vec128_v16i8_to_v2i64_factor8:
885; SSE42:       # %bb.0:
886; SSE42-NEXT:    movdqa (%rdi), %xmm0
887; SSE42-NEXT:    paddb (%rsi), %xmm0
888; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
889; SSE42-NEXT:    paddb (%rdx), %xmm0
890; SSE42-NEXT:    movdqa %xmm0, (%rcx)
891; SSE42-NEXT:    retq
892;
893; AVX-LABEL: vec128_v16i8_to_v2i64_factor8:
894; AVX:       # %bb.0:
895; AVX-NEXT:    vmovdqa (%rdi), %xmm0
896; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
897; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
898; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
899; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
900; AVX-NEXT:    retq
901;
902; AVX2-LABEL: vec128_v16i8_to_v2i64_factor8:
903; AVX2:       # %bb.0:
904; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
905; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
906; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
907; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
908; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
909; AVX2-NEXT:    vzeroupper
910; AVX2-NEXT:    retq
911;
912; AVX512F-LABEL: vec128_v16i8_to_v2i64_factor8:
913; AVX512F:       # %bb.0:
914; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
915; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
916; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
917; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
918; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
919; AVX512F-NEXT:    vzeroupper
920; AVX512F-NEXT:    retq
921;
922; AVX512BW-LABEL: vec128_v16i8_to_v2i64_factor8:
923; AVX512BW:       # %bb.0:
924; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
925; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
926; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
927; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
928; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
929; AVX512BW-NEXT:    vzeroupper
930; AVX512BW-NEXT:    retq
931  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
932  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
933  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
934  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
935  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
936  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
937  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
938  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
939  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
940  ret void
941}
942
943define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
944; SSE-LABEL: vec128_v16i8_to_v1i128_factor16:
945; SSE:       # %bb.0:
946; SSE-NEXT:    movdqa (%rdi), %xmm0
947; SSE-NEXT:    paddb (%rsi), %xmm0
948; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
949; SSE-NEXT:    paddb (%rdx), %xmm0
950; SSE-NEXT:    movdqa %xmm0, (%rcx)
951; SSE-NEXT:    retq
952;
953; AVX-LABEL: vec128_v16i8_to_v1i128_factor16:
954; AVX:       # %bb.0:
955; AVX-NEXT:    vmovdqa (%rdi), %xmm0
956; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
957; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
958; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
959; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
960; AVX-NEXT:    retq
961;
962; AVX2-LABEL: vec128_v16i8_to_v1i128_factor16:
963; AVX2:       # %bb.0:
964; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
965; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
966; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
967; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
968; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
969; AVX2-NEXT:    vzeroupper
970; AVX2-NEXT:    retq
971;
972; AVX512F-LABEL: vec128_v16i8_to_v1i128_factor16:
973; AVX512F:       # %bb.0:
974; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
975; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
976; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
977; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
978; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
979; AVX512F-NEXT:    vzeroupper
980; AVX512F-NEXT:    retq
981;
982; AVX512BW-LABEL: vec128_v16i8_to_v1i128_factor16:
983; AVX512BW:       # %bb.0:
984; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
985; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
986; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
987; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
988; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
989; AVX512BW-NEXT:    vzeroupper
990; AVX512BW-NEXT:    retq
991  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
992  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
993  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
994  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
995  %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
996  %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
997  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
998  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
999  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1000  ret void
1001}
1002
1003define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1004; SSE2-LABEL: vec128_v8i16_to_v4i32_factor2:
1005; SSE2:       # %bb.0:
1006; SSE2-NEXT:    movdqa (%rdi), %xmm0
1007; SSE2-NEXT:    paddb (%rsi), %xmm0
1008; SSE2-NEXT:    pxor %xmm1, %xmm1
1009; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1010; SSE2-NEXT:    paddb (%rdx), %xmm0
1011; SSE2-NEXT:    movdqa %xmm0, (%rcx)
1012; SSE2-NEXT:    retq
1013;
1014; SSE42-LABEL: vec128_v8i16_to_v4i32_factor2:
1015; SSE42:       # %bb.0:
1016; SSE42-NEXT:    movdqa (%rdi), %xmm0
1017; SSE42-NEXT:    paddb (%rsi), %xmm0
1018; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1019; SSE42-NEXT:    paddb (%rdx), %xmm0
1020; SSE42-NEXT:    movdqa %xmm0, (%rcx)
1021; SSE42-NEXT:    retq
1022;
1023; AVX-LABEL: vec128_v8i16_to_v4i32_factor2:
1024; AVX:       # %bb.0:
1025; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1026; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1027; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1028; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1029; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1030; AVX-NEXT:    retq
1031;
1032; AVX2-LABEL: vec128_v8i16_to_v4i32_factor2:
1033; AVX2:       # %bb.0:
1034; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1035; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1036; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1037; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1038; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1039; AVX2-NEXT:    vzeroupper
1040; AVX2-NEXT:    retq
1041;
1042; AVX512F-LABEL: vec128_v8i16_to_v4i32_factor2:
1043; AVX512F:       # %bb.0:
1044; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1045; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1046; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1047; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1048; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1049; AVX512F-NEXT:    vzeroupper
1050; AVX512F-NEXT:    retq
1051;
1052; AVX512BW-LABEL: vec128_v8i16_to_v4i32_factor2:
1053; AVX512BW:       # %bb.0:
1054; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1055; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1056; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1057; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1058; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1059; AVX512BW-NEXT:    vzeroupper
1060; AVX512BW-NEXT:    retq
1061  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1062  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1063  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1064  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1065  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
1066  %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
1067  %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
1068  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1069  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1070  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1071  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1072  ret void
1073}
1074
1075define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1076; SSE2-LABEL: vec128_v8i16_to_v2i64_factor4:
1077; SSE2:       # %bb.0:
1078; SSE2-NEXT:    movdqa (%rdi), %xmm0
1079; SSE2-NEXT:    paddb (%rsi), %xmm0
1080; SSE2-NEXT:    pxor %xmm1, %xmm1
1081; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1082; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1083; SSE2-NEXT:    paddb (%rdx), %xmm0
1084; SSE2-NEXT:    movdqa %xmm0, (%rcx)
1085; SSE2-NEXT:    retq
1086;
1087; SSE42-LABEL: vec128_v8i16_to_v2i64_factor4:
1088; SSE42:       # %bb.0:
1089; SSE42-NEXT:    movdqa (%rdi), %xmm0
1090; SSE42-NEXT:    paddb (%rsi), %xmm0
1091; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1092; SSE42-NEXT:    paddb (%rdx), %xmm0
1093; SSE42-NEXT:    movdqa %xmm0, (%rcx)
1094; SSE42-NEXT:    retq
1095;
1096; AVX-LABEL: vec128_v8i16_to_v2i64_factor4:
1097; AVX:       # %bb.0:
1098; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1099; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1100; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1101; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1102; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1103; AVX-NEXT:    retq
1104;
1105; AVX2-LABEL: vec128_v8i16_to_v2i64_factor4:
1106; AVX2:       # %bb.0:
1107; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1108; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1109; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1110; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1111; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1112; AVX2-NEXT:    vzeroupper
1113; AVX2-NEXT:    retq
1114;
1115; AVX512F-LABEL: vec128_v8i16_to_v2i64_factor4:
1116; AVX512F:       # %bb.0:
1117; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1118; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1119; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1120; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1121; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1122; AVX512F-NEXT:    vzeroupper
1123; AVX512F-NEXT:    retq
1124;
1125; AVX512BW-LABEL: vec128_v8i16_to_v2i64_factor4:
1126; AVX512BW:       # %bb.0:
1127; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1128; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1129; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1130; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1131; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1132; AVX512BW-NEXT:    vzeroupper
1133; AVX512BW-NEXT:    retq
1134  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1135  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1136  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1137  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1138  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
1139  %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
1140  %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
1141  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1142  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1143  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1144  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1145  ret void
1146}
1147
1148define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1149; SSE2-LABEL: vec128_v8i16_to_v1i128_factor8:
1150; SSE2:       # %bb.0:
1151; SSE2-NEXT:    movdqa (%rdi), %xmm0
1152; SSE2-NEXT:    paddb (%rsi), %xmm0
1153; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1154; SSE2-NEXT:    paddb (%rdx), %xmm0
1155; SSE2-NEXT:    movdqa %xmm0, (%rcx)
1156; SSE2-NEXT:    retq
1157;
1158; SSE42-LABEL: vec128_v8i16_to_v1i128_factor8:
1159; SSE42:       # %bb.0:
1160; SSE42-NEXT:    movdqa (%rdi), %xmm0
1161; SSE42-NEXT:    paddb (%rsi), %xmm0
1162; SSE42-NEXT:    pxor %xmm1, %xmm1
1163; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1164; SSE42-NEXT:    paddb (%rdx), %xmm1
1165; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1166; SSE42-NEXT:    retq
1167;
1168; AVX-LABEL: vec128_v8i16_to_v1i128_factor8:
1169; AVX:       # %bb.0:
1170; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1171; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1172; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1173; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1174; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1175; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1176; AVX-NEXT:    retq
1177;
1178; AVX2-LABEL: vec128_v8i16_to_v1i128_factor8:
1179; AVX2:       # %bb.0:
1180; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1181; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1182; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1183; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1184; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1185; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1186; AVX2-NEXT:    vzeroupper
1187; AVX2-NEXT:    retq
1188;
1189; AVX512F-LABEL: vec128_v8i16_to_v1i128_factor8:
1190; AVX512F:       # %bb.0:
1191; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1192; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1193; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1194; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1195; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1196; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1197; AVX512F-NEXT:    vzeroupper
1198; AVX512F-NEXT:    retq
1199;
1200; AVX512BW-LABEL: vec128_v8i16_to_v1i128_factor8:
1201; AVX512BW:       # %bb.0:
1202; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1203; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1204; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1205; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1206; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1207; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1208; AVX512BW-NEXT:    vzeroupper
1209; AVX512BW-NEXT:    retq
1210  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1211  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1212  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1213  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1214  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
1215  %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1216  %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
1217  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1218  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1219  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1220  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1221  ret void
1222}
1223
1224define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1225; SSE2-LABEL: vec128_v4i32_to_v2i64_factor2:
1226; SSE2:       # %bb.0:
1227; SSE2-NEXT:    movdqa (%rdi), %xmm0
1228; SSE2-NEXT:    paddb (%rsi), %xmm0
1229; SSE2-NEXT:    pxor %xmm1, %xmm1
1230; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1231; SSE2-NEXT:    paddb (%rdx), %xmm0
1232; SSE2-NEXT:    movdqa %xmm0, (%rcx)
1233; SSE2-NEXT:    retq
1234;
1235; SSE42-LABEL: vec128_v4i32_to_v2i64_factor2:
1236; SSE42:       # %bb.0:
1237; SSE42-NEXT:    movdqa (%rdi), %xmm0
1238; SSE42-NEXT:    paddb (%rsi), %xmm0
1239; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1240; SSE42-NEXT:    paddb (%rdx), %xmm0
1241; SSE42-NEXT:    movdqa %xmm0, (%rcx)
1242; SSE42-NEXT:    retq
1243;
1244; AVX-LABEL: vec128_v4i32_to_v2i64_factor2:
1245; AVX:       # %bb.0:
1246; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1247; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1248; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1249; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1250; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1251; AVX-NEXT:    retq
1252;
1253; AVX2-LABEL: vec128_v4i32_to_v2i64_factor2:
1254; AVX2:       # %bb.0:
1255; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1256; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1257; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1258; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1259; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1260; AVX2-NEXT:    vzeroupper
1261; AVX2-NEXT:    retq
1262;
1263; AVX512F-LABEL: vec128_v4i32_to_v2i64_factor2:
1264; AVX512F:       # %bb.0:
1265; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1266; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1267; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1268; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1269; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1270; AVX512F-NEXT:    vzeroupper
1271; AVX512F-NEXT:    retq
1272;
1273; AVX512BW-LABEL: vec128_v4i32_to_v2i64_factor2:
1274; AVX512BW:       # %bb.0:
1275; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1276; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1277; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1278; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1279; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1280; AVX512BW-NEXT:    vzeroupper
1281; AVX512BW-NEXT:    retq
1282  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1283  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1284  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1285  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1286  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32>
1287  %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1288  %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8>
1289  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1290  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1291  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1292  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1293  ret void
1294}
1295
1296define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1297; SSE2-LABEL: vec128_v4i32_to_v1i128_factor4:
1298; SSE2:       # %bb.0:
1299; SSE2-NEXT:    movdqa (%rdi), %xmm0
1300; SSE2-NEXT:    paddb (%rsi), %xmm0
1301; SSE2-NEXT:    xorps %xmm1, %xmm1
1302; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1303; SSE2-NEXT:    paddb (%rdx), %xmm1
1304; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1305; SSE2-NEXT:    retq
1306;
1307; SSE42-LABEL: vec128_v4i32_to_v1i128_factor4:
1308; SSE42:       # %bb.0:
1309; SSE42-NEXT:    movdqa (%rdi), %xmm0
1310; SSE42-NEXT:    paddb (%rsi), %xmm0
1311; SSE42-NEXT:    pxor %xmm1, %xmm1
1312; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1313; SSE42-NEXT:    paddb (%rdx), %xmm1
1314; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1315; SSE42-NEXT:    retq
1316;
1317; AVX-LABEL: vec128_v4i32_to_v1i128_factor4:
1318; AVX:       # %bb.0:
1319; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1320; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1321; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1322; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1323; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1324; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1325; AVX-NEXT:    retq
1326;
1327; AVX2-LABEL: vec128_v4i32_to_v1i128_factor4:
1328; AVX2:       # %bb.0:
1329; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1330; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1331; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1332; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1333; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1334; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1335; AVX2-NEXT:    vzeroupper
1336; AVX2-NEXT:    retq
1337;
1338; AVX512F-LABEL: vec128_v4i32_to_v1i128_factor4:
1339; AVX512F:       # %bb.0:
1340; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1341; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1342; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1343; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1344; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1345; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1346; AVX512F-NEXT:    vzeroupper
1347; AVX512F-NEXT:    retq
1348;
1349; AVX512BW-LABEL: vec128_v4i32_to_v1i128_factor4:
1350; AVX512BW:       # %bb.0:
1351; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1352; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1353; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1354; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1355; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1356; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1357; AVX512BW-NEXT:    vzeroupper
1358; AVX512BW-NEXT:    retq
1359  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1360  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1361  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1362  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1363  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32>
1364  %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1365  %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8>
1366  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1367  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1368  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1369  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1370  ret void
1371}
1372
1373define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1374; SSE-LABEL: vec128_v2i64_to_v1i128_factor2:
1375; SSE:       # %bb.0:
1376; SSE-NEXT:    movdqa (%rdi), %xmm0
1377; SSE-NEXT:    paddb (%rsi), %xmm0
1378; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1379; SSE-NEXT:    paddb (%rdx), %xmm0
1380; SSE-NEXT:    movdqa %xmm0, (%rcx)
1381; SSE-NEXT:    retq
1382;
1383; AVX-LABEL: vec128_v2i64_to_v1i128_factor2:
1384; AVX:       # %bb.0:
1385; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1386; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1387; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1388; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1389; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1390; AVX-NEXT:    retq
1391;
1392; AVX2-LABEL: vec128_v2i64_to_v1i128_factor2:
1393; AVX2:       # %bb.0:
1394; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1395; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1396; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1397; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1398; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1399; AVX2-NEXT:    vzeroupper
1400; AVX2-NEXT:    retq
1401;
1402; AVX512F-LABEL: vec128_v2i64_to_v1i128_factor2:
1403; AVX512F:       # %bb.0:
1404; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1405; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1406; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1407; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1408; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1409; AVX512F-NEXT:    vzeroupper
1410; AVX512F-NEXT:    retq
1411;
1412; AVX512BW-LABEL: vec128_v2i64_to_v1i128_factor2:
1413; AVX512BW:       # %bb.0:
1414; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1415; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1416; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1417; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1418; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1419; AVX512BW-NEXT:    vzeroupper
1420; AVX512BW-NEXT:    retq
1421  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1422  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1423  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1424  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1425  %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <2 x i64>
1426  %zextd.vec = shufflevector <2 x i64> %in.vec.cast, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
1427  %out.bytevec = bitcast <2 x i64> %zextd.vec to <16 x i8>
1428  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1429  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1430  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1431  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1432  ret void
1433}
1434
1435define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1436; SSE2-LABEL: vec256_v32i8_to_v16i16_factor2:
1437; SSE2:       # %bb.0:
1438; SSE2-NEXT:    movdqa (%rdi), %xmm0
1439; SSE2-NEXT:    paddb (%rsi), %xmm0
1440; SSE2-NEXT:    pxor %xmm1, %xmm1
1441; SSE2-NEXT:    movdqa %xmm0, %xmm2
1442; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1443; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1444; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1445; SSE2-NEXT:    paddb (%rdx), %xmm2
1446; SSE2-NEXT:    movdqa %xmm2, (%rcx)
1447; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1448; SSE2-NEXT:    retq
1449;
1450; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2:
1451; SSE42:       # %bb.0:
1452; SSE42-NEXT:    movdqa (%rdi), %xmm0
1453; SSE42-NEXT:    paddb (%rsi), %xmm0
1454; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1455; SSE42-NEXT:    pxor %xmm2, %xmm2
1456; SSE42-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1457; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1458; SSE42-NEXT:    paddb (%rdx), %xmm1
1459; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1460; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1461; SSE42-NEXT:    retq
1462;
1463; AVX-LABEL: vec256_v32i8_to_v16i16_factor2:
1464; AVX:       # %bb.0:
1465; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1466; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1467; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1468; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1469; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1470; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1471; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1472; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1473; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1474; AVX-NEXT:    retq
1475;
1476; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2:
1477; AVX2:       # %bb.0:
1478; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1479; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1480; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1481; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1482; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1483; AVX2-NEXT:    vzeroupper
1484; AVX2-NEXT:    retq
1485;
1486; AVX512F-LABEL: vec256_v32i8_to_v16i16_factor2:
1487; AVX512F:       # %bb.0:
1488; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1489; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1490; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1491; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1492; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1493; AVX512F-NEXT:    vzeroupper
1494; AVX512F-NEXT:    retq
1495;
1496; AVX512BW-LABEL: vec256_v32i8_to_v16i16_factor2:
1497; AVX512BW:       # %bb.0:
1498; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1499; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1500; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1501; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1502; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1503; AVX512BW-NEXT:    vzeroupper
1504; AVX512BW-NEXT:    retq
1505  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1506  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1507  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1508  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1509  %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 1, i32 35, i32 2, i32 37, i32 3, i32 39, i32 4, i32 41, i32 5, i32 43, i32 6, i32 45, i32 7, i32 47, i32 8, i32 49, i32 9, i32 51, i32 10, i32 53, i32 11, i32 55, i32 12, i32 57, i32 13, i32 59, i32 14, i32 61, i32 15, i32 63>
1510  %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1511  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1512  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1513  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1514  ret void
1515}
1516
1517define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1518; SSE2-LABEL: vec256_v32i8_to_v8i32_factor4:
1519; SSE2:       # %bb.0:
1520; SSE2-NEXT:    movdqa (%rdi), %xmm0
1521; SSE2-NEXT:    paddb (%rsi), %xmm0
1522; SSE2-NEXT:    pxor %xmm1, %xmm1
1523; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1524; SSE2-NEXT:    movdqa %xmm0, %xmm2
1525; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1526; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1527; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1528; SSE2-NEXT:    paddb (%rdx), %xmm2
1529; SSE2-NEXT:    movdqa %xmm2, (%rcx)
1530; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1531; SSE2-NEXT:    retq
1532;
1533; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4:
1534; SSE42:       # %bb.0:
1535; SSE42-NEXT:    movdqa (%rdi), %xmm0
1536; SSE42-NEXT:    paddb (%rsi), %xmm0
1537; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1538; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1539; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1540; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1541; SSE42-NEXT:    paddb (%rdx), %xmm1
1542; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1543; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1544; SSE42-NEXT:    retq
1545;
1546; AVX-LABEL: vec256_v32i8_to_v8i32_factor4:
1547; AVX:       # %bb.0:
1548; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1549; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1550; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1551; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1552; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1553; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1554; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1555; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1556; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1557; AVX-NEXT:    retq
1558;
1559; AVX2-LABEL: vec256_v32i8_to_v8i32_factor4:
1560; AVX2:       # %bb.0:
1561; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1562; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1563; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1564; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1565; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1566; AVX2-NEXT:    vzeroupper
1567; AVX2-NEXT:    retq
1568;
1569; AVX512F-LABEL: vec256_v32i8_to_v8i32_factor4:
1570; AVX512F:       # %bb.0:
1571; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1572; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1573; AVX512F-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1574; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1575; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1576; AVX512F-NEXT:    vzeroupper
1577; AVX512F-NEXT:    retq
1578;
1579; AVX512BW-LABEL: vec256_v32i8_to_v8i32_factor4:
1580; AVX512BW:       # %bb.0:
1581; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1582; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1583; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1584; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1585; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1586; AVX512BW-NEXT:    vzeroupper
1587; AVX512BW-NEXT:    retq
1588  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1589  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1590  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1591  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1592  %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 3, i32 45, i32 46, i32 47, i32 4, i32 49, i32 50, i32 51, i32 5, i32 53, i32 54, i32 55, i32 6, i32 57, i32 58, i32 59, i32 7, i32 61, i32 62, i32 63>
1593  %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1594  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1595  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1596  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1597  ret void
1598}
1599
1600define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1601; SSE2-LABEL: vec256_v32i8_to_v4i64_factor8:
1602; SSE2:       # %bb.0:
1603; SSE2-NEXT:    movdqa (%rdi), %xmm0
1604; SSE2-NEXT:    paddb (%rsi), %xmm0
1605; SSE2-NEXT:    pxor %xmm1, %xmm1
1606; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1607; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1608; SSE2-NEXT:    movdqa %xmm0, %xmm2
1609; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1610; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1611; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1612; SSE2-NEXT:    paddb (%rdx), %xmm2
1613; SSE2-NEXT:    movdqa %xmm2, (%rcx)
1614; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1615; SSE2-NEXT:    retq
1616;
1617; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8:
1618; SSE42:       # %bb.0:
1619; SSE42-NEXT:    movdqa (%rdi), %xmm0
1620; SSE42-NEXT:    paddb (%rsi), %xmm0
1621; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1622; SSE42-NEXT:    psrld $16, %xmm0
1623; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1624; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1625; SSE42-NEXT:    paddb (%rdx), %xmm1
1626; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1627; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1628; SSE42-NEXT:    retq
1629;
1630; AVX-LABEL: vec256_v32i8_to_v4i64_factor8:
1631; AVX:       # %bb.0:
1632; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1633; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1634; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1635; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
1636; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1637; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1638; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1639; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1640; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1641; AVX-NEXT:    retq
1642;
1643; AVX2-LABEL: vec256_v32i8_to_v4i64_factor8:
1644; AVX2:       # %bb.0:
1645; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1646; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1647; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1648; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1649; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1650; AVX2-NEXT:    vzeroupper
1651; AVX2-NEXT:    retq
1652;
1653; AVX512F-LABEL: vec256_v32i8_to_v4i64_factor8:
1654; AVX512F:       # %bb.0:
1655; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1656; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1657; AVX512F-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1658; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1659; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1660; AVX512F-NEXT:    vzeroupper
1661; AVX512F-NEXT:    retq
1662;
1663; AVX512BW-LABEL: vec256_v32i8_to_v4i64_factor8:
1664; AVX512BW:       # %bb.0:
1665; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1666; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1667; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1668; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1669; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1670; AVX512BW-NEXT:    vzeroupper
1671; AVX512BW-NEXT:    retq
1672  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1673  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1674  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1675  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1676  %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 1, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 2, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 3, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1677  %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1678  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1679  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1680  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1681  ret void
1682}
1683
1684define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1685; SSE2-LABEL: vec256_v32i8_to_v2i128_factor16:
1686; SSE2:       # %bb.0:
1687; SSE2-NEXT:    movdqa (%rdi), %xmm0
1688; SSE2-NEXT:    paddb (%rsi), %xmm0
1689; SSE2-NEXT:    movd {{.*#+}} xmm1 = [255,0,0,0]
1690; SSE2-NEXT:    pand %xmm0, %xmm1
1691; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1692; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1693; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1694; SSE2-NEXT:    paddb (%rdx), %xmm1
1695; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1696; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1697; SSE2-NEXT:    retq
1698;
1699; SSE42-LABEL: vec256_v32i8_to_v2i128_factor16:
1700; SSE42:       # %bb.0:
1701; SSE42-NEXT:    movdqa (%rdi), %xmm0
1702; SSE42-NEXT:    paddb (%rsi), %xmm0
1703; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = [255,0]
1704; SSE42-NEXT:    pand %xmm0, %xmm1
1705; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1706; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1707; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1708; SSE42-NEXT:    paddb (%rdx), %xmm1
1709; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1710; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1711; SSE42-NEXT:    retq
1712;
1713; AVX-LABEL: vec256_v32i8_to_v2i128_factor16:
1714; AVX:       # %bb.0:
1715; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1716; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1717; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1718; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1719; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1720; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1721; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1722; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1723; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1724; AVX-NEXT:    retq
1725;
1726; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16:
1727; AVX2:       # %bb.0:
1728; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1729; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1730; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1731; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1732; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1733; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1734; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1735; AVX2-NEXT:    vzeroupper
1736; AVX2-NEXT:    retq
1737;
1738; AVX512F-LABEL: vec256_v32i8_to_v2i128_factor16:
1739; AVX512F:       # %bb.0:
1740; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1741; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1742; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1743; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1744; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1745; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1746; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1747; AVX512F-NEXT:    vzeroupper
1748; AVX512F-NEXT:    retq
1749;
1750; AVX512BW-LABEL: vec256_v32i8_to_v2i128_factor16:
1751; AVX512BW:       # %bb.0:
1752; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1753; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1754; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1755; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1756; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1757; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1758; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1759; AVX512BW-NEXT:    vzeroupper
1760; AVX512BW-NEXT:    retq
1761  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1762  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1763  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1764  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1765  %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 1, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1766  %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1767  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1768  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1769  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1770  ret void
1771}
1772
1773define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1774; SSE-LABEL: vec256_v32i8_to_v1i256_factor32:
1775; SSE:       # %bb.0:
1776; SSE-NEXT:    movdqa (%rdi), %xmm0
1777; SSE-NEXT:    paddb (%rsi), %xmm0
1778; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1779; SSE-NEXT:    movaps 16(%rdx), %xmm1
1780; SSE-NEXT:    paddb (%rdx), %xmm0
1781; SSE-NEXT:    movaps %xmm1, 16(%rcx)
1782; SSE-NEXT:    movdqa %xmm0, (%rcx)
1783; SSE-NEXT:    retq
1784;
1785; AVX-LABEL: vec256_v32i8_to_v1i256_factor32:
1786; AVX:       # %bb.0:
1787; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1788; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1789; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1790; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1791; AVX-NEXT:    vmovaps 16(%rdx), %xmm1
1792; AVX-NEXT:    vmovaps %xmm1, 16(%rcx)
1793; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1794; AVX-NEXT:    retq
1795;
1796; AVX2-LABEL: vec256_v32i8_to_v1i256_factor32:
1797; AVX2:       # %bb.0:
1798; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1799; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1800; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
1801; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1802; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1803; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1804; AVX2-NEXT:    vzeroupper
1805; AVX2-NEXT:    retq
1806;
1807; AVX512F-LABEL: vec256_v32i8_to_v1i256_factor32:
1808; AVX512F:       # %bb.0:
1809; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1810; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1811; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
1812; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
1813; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1814; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1815; AVX512F-NEXT:    vzeroupper
1816; AVX512F-NEXT:    retq
1817;
1818; AVX512BW-LABEL: vec256_v32i8_to_v1i256_factor32:
1819; AVX512BW:       # %bb.0:
1820; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
1821; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1822; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
1823; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
1824; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1825; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1826; AVX512BW-NEXT:    vzeroupper
1827; AVX512BW-NEXT:    retq
1828  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1829  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1830  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1831  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1832  %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1833  %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1834  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1835  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1836  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1837  ret void
1838}
1839
1840define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1841; SSE2-LABEL: vec256_v16i16_to_v8i32_factor2:
1842; SSE2:       # %bb.0:
1843; SSE2-NEXT:    movdqa (%rdi), %xmm0
1844; SSE2-NEXT:    paddb (%rsi), %xmm0
1845; SSE2-NEXT:    pxor %xmm1, %xmm1
1846; SSE2-NEXT:    movdqa %xmm0, %xmm2
1847; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1848; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1849; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1850; SSE2-NEXT:    paddb (%rdx), %xmm2
1851; SSE2-NEXT:    movdqa %xmm2, (%rcx)
1852; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1853; SSE2-NEXT:    retq
1854;
1855; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2:
1856; SSE42:       # %bb.0:
1857; SSE42-NEXT:    movdqa (%rdi), %xmm0
1858; SSE42-NEXT:    paddb (%rsi), %xmm0
1859; SSE42-NEXT:    pxor %xmm1, %xmm1
1860; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1861; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1862; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1863; SSE42-NEXT:    paddb (%rdx), %xmm2
1864; SSE42-NEXT:    movdqa %xmm2, (%rcx)
1865; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1866; SSE42-NEXT:    retq
1867;
1868; AVX-LABEL: vec256_v16i16_to_v8i32_factor2:
1869; AVX:       # %bb.0:
1870; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1871; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1872; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1873; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1874; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1875; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1876; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1877; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1878; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1879; AVX-NEXT:    retq
1880;
1881; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2:
1882; AVX2:       # %bb.0:
1883; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1884; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1885; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1886; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1887; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1888; AVX2-NEXT:    vzeroupper
1889; AVX2-NEXT:    retq
1890;
1891; AVX512F-LABEL: vec256_v16i16_to_v8i32_factor2:
1892; AVX512F:       # %bb.0:
1893; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1894; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1895; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1896; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1897; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1898; AVX512F-NEXT:    vzeroupper
1899; AVX512F-NEXT:    retq
1900;
1901; AVX512BW-LABEL: vec256_v16i16_to_v8i32_factor2:
1902; AVX512BW:       # %bb.0:
1903; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1904; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1905; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1906; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1907; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1908; AVX512BW-NEXT:    vzeroupper
1909; AVX512BW-NEXT:    retq
1910  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1911  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1912  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1913  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1914  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
1915  %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1916  %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
1917  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1918  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1919  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1920  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1921  ret void
1922}
1923
1924define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1925; SSE2-LABEL: vec256_v16i16_to_v4i64_factor4:
1926; SSE2:       # %bb.0:
1927; SSE2-NEXT:    movdqa (%rdi), %xmm0
1928; SSE2-NEXT:    paddb (%rsi), %xmm0
1929; SSE2-NEXT:    pxor %xmm1, %xmm1
1930; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1931; SSE2-NEXT:    movdqa %xmm0, %xmm2
1932; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1933; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1934; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1935; SSE2-NEXT:    paddb (%rdx), %xmm2
1936; SSE2-NEXT:    movdqa %xmm2, (%rcx)
1937; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1938; SSE2-NEXT:    retq
1939;
1940; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4:
1941; SSE42:       # %bb.0:
1942; SSE42-NEXT:    movdqa (%rdi), %xmm0
1943; SSE42-NEXT:    paddb (%rsi), %xmm0
1944; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1945; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1946; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1947; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1948; SSE42-NEXT:    paddb (%rdx), %xmm1
1949; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1950; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1951; SSE42-NEXT:    retq
1952;
1953; AVX-LABEL: vec256_v16i16_to_v4i64_factor4:
1954; AVX:       # %bb.0:
1955; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1956; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1957; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1958; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1959; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1960; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1961; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1962; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1963; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1964; AVX-NEXT:    retq
1965;
1966; AVX2-LABEL: vec256_v16i16_to_v4i64_factor4:
1967; AVX2:       # %bb.0:
1968; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
1969; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1970; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1971; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1972; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1973; AVX2-NEXT:    vzeroupper
1974; AVX2-NEXT:    retq
1975;
1976; AVX512F-LABEL: vec256_v16i16_to_v4i64_factor4:
1977; AVX512F:       # %bb.0:
1978; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
1979; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1980; AVX512F-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1981; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1982; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1983; AVX512F-NEXT:    vzeroupper
1984; AVX512F-NEXT:    retq
1985;
1986; AVX512BW-LABEL: vec256_v16i16_to_v4i64_factor4:
1987; AVX512BW:       # %bb.0:
1988; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1989; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1990; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1991; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1992; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1993; AVX512BW-NEXT:    vzeroupper
1994; AVX512BW-NEXT:    retq
1995  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1996  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1997  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1998  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1999  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
2000  %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
2001  %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
2002  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2003  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2004  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2005  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2006  ret void
2007}
2008
2009define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2010; SSE2-LABEL: vec256_v16i16_to_v2i128_factor8:
2011; SSE2:       # %bb.0:
2012; SSE2-NEXT:    movdqa (%rdi), %xmm0
2013; SSE2-NEXT:    paddb (%rsi), %xmm0
2014; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65535,0,0,0]
2015; SSE2-NEXT:    pand %xmm0, %xmm1
2016; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2017; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2018; SSE2-NEXT:    paddb 16(%rdx), %xmm0
2019; SSE2-NEXT:    paddb (%rdx), %xmm1
2020; SSE2-NEXT:    movdqa %xmm1, (%rcx)
2021; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
2022; SSE2-NEXT:    retq
2023;
2024; SSE42-LABEL: vec256_v16i16_to_v2i128_factor8:
2025; SSE42:       # %bb.0:
2026; SSE42-NEXT:    movdqa (%rdi), %xmm0
2027; SSE42-NEXT:    paddb (%rsi), %xmm0
2028; SSE42-NEXT:    pxor %xmm1, %xmm1
2029; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2030; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2031; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2032; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2033; SSE42-NEXT:    paddb (%rdx), %xmm1
2034; SSE42-NEXT:    movdqa %xmm1, (%rcx)
2035; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2036; SSE42-NEXT:    retq
2037;
2038; AVX-LABEL: vec256_v16i16_to_v2i128_factor8:
2039; AVX:       # %bb.0:
2040; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2041; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2042; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2043; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2044; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2045; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2046; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2047; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2048; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2049; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2050; AVX-NEXT:    retq
2051;
2052; AVX2-LABEL: vec256_v16i16_to_v2i128_factor8:
2053; AVX2:       # %bb.0:
2054; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2055; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2056; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2057; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2058; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2059; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
2060; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2061; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2062; AVX2-NEXT:    vzeroupper
2063; AVX2-NEXT:    retq
2064;
2065; AVX512F-LABEL: vec256_v16i16_to_v2i128_factor8:
2066; AVX512F:       # %bb.0:
2067; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2068; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2069; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2070; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2071; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2072; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
2073; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2074; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2075; AVX512F-NEXT:    vzeroupper
2076; AVX512F-NEXT:    retq
2077;
2078; AVX512BW-LABEL: vec256_v16i16_to_v2i128_factor8:
2079; AVX512BW:       # %bb.0:
2080; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2081; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2082; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
2083; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2084; AVX512BW-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
2085; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
2086; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2087; AVX512BW-NEXT:    vzeroupper
2088; AVX512BW-NEXT:    retq
2089  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2090  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2091  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2092  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2093  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
2094  %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2095  %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
2096  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2097  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2098  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2099  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2100  ret void
2101}
2102
2103define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2104; SSE2-LABEL: vec256_v16i16_to_v1i256_factor16:
2105; SSE2:       # %bb.0:
2106; SSE2-NEXT:    movdqa (%rdi), %xmm0
2107; SSE2-NEXT:    paddb (%rsi), %xmm0
2108; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2109; SSE2-NEXT:    movaps 16(%rdx), %xmm1
2110; SSE2-NEXT:    paddb (%rdx), %xmm0
2111; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
2112; SSE2-NEXT:    movdqa %xmm0, (%rcx)
2113; SSE2-NEXT:    retq
2114;
2115; SSE42-LABEL: vec256_v16i16_to_v1i256_factor16:
2116; SSE42:       # %bb.0:
2117; SSE42-NEXT:    movdqa (%rdi), %xmm0
2118; SSE42-NEXT:    paddb (%rsi), %xmm0
2119; SSE42-NEXT:    pxor %xmm1, %xmm1
2120; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2121; SSE42-NEXT:    movaps 16(%rdx), %xmm0
2122; SSE42-NEXT:    paddb (%rdx), %xmm1
2123; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
2124; SSE42-NEXT:    movdqa %xmm1, (%rcx)
2125; SSE42-NEXT:    retq
2126;
2127; AVX-LABEL: vec256_v16i16_to_v1i256_factor16:
2128; AVX:       # %bb.0:
2129; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2130; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2131; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2132; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2133; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2134; AVX-NEXT:    vmovaps 16(%rdx), %xmm1
2135; AVX-NEXT:    vmovaps %xmm1, 16(%rcx)
2136; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2137; AVX-NEXT:    retq
2138;
2139; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16:
2140; AVX2:       # %bb.0:
2141; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2142; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2143; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
2144; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2145; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2146; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2147; AVX2-NEXT:    vzeroupper
2148; AVX2-NEXT:    retq
2149;
2150; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16:
2151; AVX512F:       # %bb.0:
2152; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2153; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2154; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
2155; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
2156; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2157; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2158; AVX512F-NEXT:    vzeroupper
2159; AVX512F-NEXT:    retq
2160;
2161; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16:
2162; AVX512BW:       # %bb.0:
2163; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2164; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2165; AVX512BW-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
2166; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
2167; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2168; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2169; AVX512BW-NEXT:    vzeroupper
2170; AVX512BW-NEXT:    retq
2171  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2172  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2173  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2174  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2175  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
2176  %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2177  %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
2178  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2179  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2180  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2181  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2182  ret void
2183}
2184
2185define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2186; SSE2-LABEL: vec256_v8i32_to_v4i64_factor2:
2187; SSE2:       # %bb.0:
2188; SSE2-NEXT:    movdqa (%rdi), %xmm0
2189; SSE2-NEXT:    paddb (%rsi), %xmm0
2190; SSE2-NEXT:    pxor %xmm1, %xmm1
2191; SSE2-NEXT:    movdqa %xmm0, %xmm2
2192; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2193; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2194; SSE2-NEXT:    paddb 16(%rdx), %xmm0
2195; SSE2-NEXT:    paddb (%rdx), %xmm2
2196; SSE2-NEXT:    movdqa %xmm2, (%rcx)
2197; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
2198; SSE2-NEXT:    retq
2199;
2200; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2:
2201; SSE42:       # %bb.0:
2202; SSE42-NEXT:    movdqa (%rdi), %xmm0
2203; SSE42-NEXT:    paddb (%rsi), %xmm0
2204; SSE42-NEXT:    pxor %xmm1, %xmm1
2205; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
2206; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2207; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2208; SSE42-NEXT:    paddb (%rdx), %xmm2
2209; SSE42-NEXT:    movdqa %xmm2, (%rcx)
2210; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2211; SSE42-NEXT:    retq
2212;
2213; AVX-LABEL: vec256_v8i32_to_v4i64_factor2:
2214; AVX:       # %bb.0:
2215; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2216; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2217; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
2218; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2219; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2220; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2221; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2222; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2223; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2224; AVX-NEXT:    retq
2225;
2226; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2:
2227; AVX2:       # %bb.0:
2228; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2229; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2230; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2231; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2232; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2233; AVX2-NEXT:    vzeroupper
2234; AVX2-NEXT:    retq
2235;
2236; AVX512F-LABEL: vec256_v8i32_to_v4i64_factor2:
2237; AVX512F:       # %bb.0:
2238; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2239; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2240; AVX512F-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2241; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2242; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2243; AVX512F-NEXT:    vzeroupper
2244; AVX512F-NEXT:    retq
2245;
2246; AVX512BW-LABEL: vec256_v8i32_to_v4i64_factor2:
2247; AVX512BW:       # %bb.0:
2248; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
2249; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2250; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2251; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2252; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2253; AVX512BW-NEXT:    vzeroupper
2254; AVX512BW-NEXT:    retq
2255  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2256  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2257  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2258  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2259  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32>
2260  %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
2261  %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8>
2262  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2263  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2264  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2265  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2266  ret void
2267}
2268
2269define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2270; SSE2-LABEL: vec256_v8i32_to_v2i128_factor4:
2271; SSE2:       # %bb.0:
2272; SSE2-NEXT:    movdqa (%rdi), %xmm0
2273; SSE2-NEXT:    paddb (%rsi), %xmm0
2274; SSE2-NEXT:    xorps %xmm1, %xmm1
2275; SSE2-NEXT:    xorps %xmm2, %xmm2
2276; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2277; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
2278; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
2279; SSE2-NEXT:    paddb 16(%rdx), %xmm0
2280; SSE2-NEXT:    paddb (%rdx), %xmm2
2281; SSE2-NEXT:    movdqa %xmm2, (%rcx)
2282; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
2283; SSE2-NEXT:    retq
2284;
2285; SSE42-LABEL: vec256_v8i32_to_v2i128_factor4:
2286; SSE42:       # %bb.0:
2287; SSE42-NEXT:    movdqa (%rdi), %xmm0
2288; SSE42-NEXT:    paddb (%rsi), %xmm0
2289; SSE42-NEXT:    pxor %xmm1, %xmm1
2290; SSE42-NEXT:    pxor %xmm2, %xmm2
2291; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
2292; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2293; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2294; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2295; SSE42-NEXT:    paddb (%rdx), %xmm2
2296; SSE42-NEXT:    movdqa %xmm2, (%rcx)
2297; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2298; SSE42-NEXT:    retq
2299;
2300; AVX-LABEL: vec256_v8i32_to_v2i128_factor4:
2301; AVX:       # %bb.0:
2302; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2303; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2304; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
2305; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2306; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2307; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2308; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
2309; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
2310; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2311; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2312; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
2313; AVX-NEXT:    vzeroupper
2314; AVX-NEXT:    retq
2315;
2316; AVX2-SLOW-LABEL: vec256_v8i32_to_v2i128_factor4:
2317; AVX2-SLOW:       # %bb.0:
2318; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
2319; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2320; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2321; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2322; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2323; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2324; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2325; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
2326; AVX2-SLOW-NEXT:    vzeroupper
2327; AVX2-SLOW-NEXT:    retq
2328;
2329; AVX2-FAST-PERLANE-LABEL: vec256_v8i32_to_v2i128_factor4:
2330; AVX2-FAST-PERLANE:       # %bb.0:
2331; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
2332; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2333; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2334; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2335; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2336; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2337; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2338; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
2339; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2340; AVX2-FAST-PERLANE-NEXT:    retq
2341;
2342; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4:
2343; AVX2-FAST:       # %bb.0:
2344; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
2345; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2346; AVX2-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0]
2347; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2348; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2349; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2350; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2351; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
2352; AVX2-FAST-NEXT:    vzeroupper
2353; AVX2-FAST-NEXT:    retq
2354;
2355; AVX512F-LABEL: vec256_v8i32_to_v2i128_factor4:
2356; AVX512F:       # %bb.0:
2357; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2358; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2359; AVX512F-NEXT:    movb $17, %al
2360; AVX512F-NEXT:    kmovw %eax, %k1
2361; AVX512F-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
2362; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2363; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2364; AVX512F-NEXT:    vzeroupper
2365; AVX512F-NEXT:    retq
2366;
2367; AVX512BW-LABEL: vec256_v8i32_to_v2i128_factor4:
2368; AVX512BW:       # %bb.0:
2369; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2370; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2371; AVX512BW-NEXT:    movb $17, %al
2372; AVX512BW-NEXT:    kmovd %eax, %k1
2373; AVX512BW-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
2374; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2375; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2376; AVX512BW-NEXT:    vzeroupper
2377; AVX512BW-NEXT:    retq
2378  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2379  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2380  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2381  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2382  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32>
2383  %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
2384  %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8>
2385  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2386  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2387  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2388  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2389  ret void
2390}
2391
2392define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2393; SSE2-LABEL: vec256_v8i32_to_v1i256_factor8:
2394; SSE2:       # %bb.0:
2395; SSE2-NEXT:    movdqa (%rdi), %xmm0
2396; SSE2-NEXT:    paddb (%rsi), %xmm0
2397; SSE2-NEXT:    xorps %xmm1, %xmm1
2398; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2399; SSE2-NEXT:    movaps 16(%rdx), %xmm0
2400; SSE2-NEXT:    paddb (%rdx), %xmm1
2401; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
2402; SSE2-NEXT:    movdqa %xmm1, (%rcx)
2403; SSE2-NEXT:    retq
2404;
2405; SSE42-LABEL: vec256_v8i32_to_v1i256_factor8:
2406; SSE42:       # %bb.0:
2407; SSE42-NEXT:    movdqa (%rdi), %xmm0
2408; SSE42-NEXT:    paddb (%rsi), %xmm0
2409; SSE42-NEXT:    pxor %xmm1, %xmm1
2410; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2411; SSE42-NEXT:    movaps 16(%rdx), %xmm0
2412; SSE42-NEXT:    paddb (%rdx), %xmm1
2413; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
2414; SSE42-NEXT:    movdqa %xmm1, (%rcx)
2415; SSE42-NEXT:    retq
2416;
2417; AVX-LABEL: vec256_v8i32_to_v1i256_factor8:
2418; AVX:       # %bb.0:
2419; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2420; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2421; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2422; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2423; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2424; AVX-NEXT:    vmovaps 16(%rdx), %xmm1
2425; AVX-NEXT:    vmovaps %xmm1, 16(%rcx)
2426; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2427; AVX-NEXT:    retq
2428;
2429; AVX2-LABEL: vec256_v8i32_to_v1i256_factor8:
2430; AVX2:       # %bb.0:
2431; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2432; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2433; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2434; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2435; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2436; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2437; AVX2-NEXT:    vzeroupper
2438; AVX2-NEXT:    retq
2439;
2440; AVX512F-LABEL: vec256_v8i32_to_v1i256_factor8:
2441; AVX512F:       # %bb.0:
2442; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2443; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2444; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2445; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2446; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2447; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2448; AVX512F-NEXT:    vzeroupper
2449; AVX512F-NEXT:    retq
2450;
2451; AVX512BW-LABEL: vec256_v8i32_to_v1i256_factor8:
2452; AVX512BW:       # %bb.0:
2453; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2454; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2455; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2456; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2457; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2458; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2459; AVX512BW-NEXT:    vzeroupper
2460; AVX512BW-NEXT:    retq
2461  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2462  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2463  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2464  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2465  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32>
2466  %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2467  %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8>
2468  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2469  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2470  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2471  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2472  ret void
2473}
2474
2475define void @vec256_v4i64_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2476; SSE-LABEL: vec256_v4i64_to_v2i128_factor2:
2477; SSE:       # %bb.0:
2478; SSE-NEXT:    movdqa (%rdi), %xmm0
2479; SSE-NEXT:    paddb (%rsi), %xmm0
2480; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
2481; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
2482; SSE-NEXT:    paddb 16(%rdx), %xmm0
2483; SSE-NEXT:    paddb (%rdx), %xmm1
2484; SSE-NEXT:    movdqa %xmm1, (%rcx)
2485; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
2486; SSE-NEXT:    retq
2487;
2488; AVX-LABEL: vec256_v4i64_to_v2i128_factor2:
2489; AVX:       # %bb.0:
2490; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2491; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2492; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2493; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
2494; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
2495; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
2496; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
2497; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2498; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2499; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
2500; AVX-NEXT:    vzeroupper
2501; AVX-NEXT:    retq
2502;
2503; AVX2-LABEL: vec256_v4i64_to_v2i128_factor2:
2504; AVX2:       # %bb.0:
2505; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2506; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2507; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2508; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2509; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2510; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2511; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2512; AVX2-NEXT:    vzeroupper
2513; AVX2-NEXT:    retq
2514;
2515; AVX512F-LABEL: vec256_v4i64_to_v2i128_factor2:
2516; AVX512F:       # %bb.0:
2517; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2518; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2519; AVX512F-NEXT:    movb $5, %al
2520; AVX512F-NEXT:    kmovw %eax, %k1
2521; AVX512F-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
2522; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2523; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2524; AVX512F-NEXT:    vzeroupper
2525; AVX512F-NEXT:    retq
2526;
2527; AVX512BW-LABEL: vec256_v4i64_to_v2i128_factor2:
2528; AVX512BW:       # %bb.0:
2529; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2530; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2531; AVX512BW-NEXT:    movb $5, %al
2532; AVX512BW-NEXT:    kmovd %eax, %k1
2533; AVX512BW-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
2534; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2535; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2536; AVX512BW-NEXT:    vzeroupper
2537; AVX512BW-NEXT:    retq
2538  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2539  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2540  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2541  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2542  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64>
2543  %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2544  %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8>
2545  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2546  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2547  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2548  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2549  ret void
2550}
2551
2552define void @vec256_v4i64_to_v1i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2553; SSE-LABEL: vec256_v4i64_to_v1i256_factor4:
2554; SSE:       # %bb.0:
2555; SSE-NEXT:    movdqa (%rdi), %xmm0
2556; SSE-NEXT:    paddb (%rsi), %xmm0
2557; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
2558; SSE-NEXT:    movaps 16(%rdx), %xmm1
2559; SSE-NEXT:    paddb (%rdx), %xmm0
2560; SSE-NEXT:    movaps %xmm1, 16(%rcx)
2561; SSE-NEXT:    movdqa %xmm0, (%rcx)
2562; SSE-NEXT:    retq
2563;
2564; AVX-LABEL: vec256_v4i64_to_v1i256_factor4:
2565; AVX:       # %bb.0:
2566; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2567; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2568; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2569; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2570; AVX-NEXT:    vmovaps 16(%rdx), %xmm1
2571; AVX-NEXT:    vmovaps %xmm1, 16(%rcx)
2572; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2573; AVX-NEXT:    retq
2574;
2575; AVX2-LABEL: vec256_v4i64_to_v1i256_factor4:
2576; AVX2:       # %bb.0:
2577; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2578; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2579; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2580; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2581; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2582; AVX2-NEXT:    vzeroupper
2583; AVX2-NEXT:    retq
2584;
2585; AVX512F-LABEL: vec256_v4i64_to_v1i256_factor4:
2586; AVX512F:       # %bb.0:
2587; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2588; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2589; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2590; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2591; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2592; AVX512F-NEXT:    vzeroupper
2593; AVX512F-NEXT:    retq
2594;
2595; AVX512BW-LABEL: vec256_v4i64_to_v1i256_factor4:
2596; AVX512BW:       # %bb.0:
2597; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2598; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2599; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2600; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2601; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2602; AVX512BW-NEXT:    vzeroupper
2603; AVX512BW-NEXT:    retq
2604  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2605  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2606  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2607  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2608  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64>
2609  %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2610  %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8>
2611  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2612  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2613  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2614  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2615  ret void
2616}
2617
2618define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2619; SSE-LABEL: vec256_v2i128_to_v1i256_factor2:
2620; SSE:       # %bb.0:
2621; SSE-NEXT:    movdqa (%rdi), %xmm0
2622; SSE-NEXT:    paddb (%rsi), %xmm0
2623; SSE-NEXT:    movaps 16(%rdx), %xmm1
2624; SSE-NEXT:    paddb (%rdx), %xmm0
2625; SSE-NEXT:    movaps %xmm1, 16(%rcx)
2626; SSE-NEXT:    movdqa %xmm0, (%rcx)
2627; SSE-NEXT:    retq
2628;
2629; AVX-LABEL: vec256_v2i128_to_v1i256_factor2:
2630; AVX:       # %bb.0:
2631; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2632; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2633; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2634; AVX-NEXT:    vmovaps 16(%rdx), %xmm1
2635; AVX-NEXT:    vmovaps %xmm1, 16(%rcx)
2636; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2637; AVX-NEXT:    retq
2638;
2639; AVX2-LABEL: vec256_v2i128_to_v1i256_factor2:
2640; AVX2:       # %bb.0:
2641; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2642; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2643; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2644; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2645; AVX2-NEXT:    vzeroupper
2646; AVX2-NEXT:    retq
2647;
2648; AVX512F-LABEL: vec256_v2i128_to_v1i256_factor2:
2649; AVX512F:       # %bb.0:
2650; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2651; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2652; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2653; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
2654; AVX512F-NEXT:    vzeroupper
2655; AVX512F-NEXT:    retq
2656;
2657; AVX512BW-LABEL: vec256_v2i128_to_v1i256_factor2:
2658; AVX512BW:       # %bb.0:
2659; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
2660; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2661; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2662; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2663; AVX512BW-NEXT:    vzeroupper
2664; AVX512BW-NEXT:    retq
2665  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2666  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2667  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2668  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2669  %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <2 x i128>
2670  %zextd.vec = shufflevector <2 x i128> %in.vec.cast, <2 x i128> zeroinitializer, <2 x i32> <i32 0, i32 3>
2671  %out.bytevec = bitcast <2 x i128> %zextd.vec to <32 x i8>
2672  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2673  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2674  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2675  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2676  ret void
2677}
2678
2679define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2680; SSE2-LABEL: vec384_v48i8_to_v24i16_factor2:
2681; SSE2:       # %bb.0:
2682; SSE2-NEXT:    movdqa (%rdi), %xmm0
2683; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
2684; SSE2-NEXT:    paddb (%rsi), %xmm0
2685; SSE2-NEXT:    paddb 16(%rsi), %xmm1
2686; SSE2-NEXT:    pxor %xmm2, %xmm2
2687; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2688; SSE2-NEXT:    movdqa %xmm0, %xmm3
2689; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2690; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2691; SSE2-NEXT:    paddb 16(%rdx), %xmm0
2692; SSE2-NEXT:    paddb (%rdx), %xmm3
2693; SSE2-NEXT:    paddb 32(%rdx), %xmm1
2694; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
2695; SSE2-NEXT:    movdqa %xmm3, (%rcx)
2696; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
2697; SSE2-NEXT:    retq
2698;
2699; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2:
2700; SSE42:       # %bb.0:
2701; SSE42-NEXT:    movdqa (%rdi), %xmm0
2702; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
2703; SSE42-NEXT:    paddb (%rsi), %xmm0
2704; SSE42-NEXT:    paddb 16(%rsi), %xmm1
2705; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2706; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2707; SSE42-NEXT:    pxor %xmm3, %xmm3
2708; SSE42-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2709; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2710; SSE42-NEXT:    paddb (%rdx), %xmm2
2711; SSE42-NEXT:    paddb 32(%rdx), %xmm1
2712; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
2713; SSE42-NEXT:    movdqa %xmm2, (%rcx)
2714; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2715; SSE42-NEXT:    retq
2716;
2717; AVX-LABEL: vec384_v48i8_to_v24i16_factor2:
2718; AVX:       # %bb.0:
2719; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2720; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
2721; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
2722; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2723; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2724; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2725; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2726; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2727; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
2728; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2729; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
2730; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
2731; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2732; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
2733; AVX-NEXT:    retq
2734;
2735; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2:
2736; AVX2:       # %bb.0:
2737; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2738; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2739; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2740; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2741; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2742; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2743; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2744; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2745; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2746; AVX2-NEXT:    vzeroupper
2747; AVX2-NEXT:    retq
2748;
2749; AVX512F-LABEL: vec384_v48i8_to_v24i16_factor2:
2750; AVX512F:       # %bb.0:
2751; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2752; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2753; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2754; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
2755; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2756; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2757; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2758; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2759; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2760; AVX512F-NEXT:    vzeroupper
2761; AVX512F-NEXT:    retq
2762;
2763; AVX512BW-LABEL: vec384_v48i8_to_v24i16_factor2:
2764; AVX512BW:       # %bb.0:
2765; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2766; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2767; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2768; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2769; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2770; AVX512BW-NEXT:    vzeroupper
2771; AVX512BW-NEXT:    retq
2772  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2773  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2774  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2775  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
2776  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 1, i32 51, i32 2, i32 53, i32 3, i32 55, i32 4, i32 57, i32 5, i32 59, i32 6, i32 61, i32 7, i32 63, i32 8, i32 65, i32 9, i32 67, i32 10, i32 69, i32 11, i32 71, i32 12, i32 73, i32 13, i32 75, i32 14, i32 77, i32 15, i32 79, i32 16, i32 81, i32 17, i32 83, i32 18, i32 85, i32 19, i32 87, i32 20, i32 89, i32 21, i32 91, i32 22, i32 93, i32 23, i32 95>
2777  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2778  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2779  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2780  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2781  ret void
2782}
2783
2784define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2785; SSE2-LABEL: vec384_v48i8_to_v16i24_factor3:
2786; SSE2:       # %bb.0:
2787; SSE2-NEXT:    movdqa (%rdi), %xmm0
2788; SSE2-NEXT:    paddb (%rsi), %xmm0
2789; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
2790; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5]
2791; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2792; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2793; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7]
2794; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
2795; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2796; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2797; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2798; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6]
2799; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2800; SSE2-NEXT:    paddb (%rdx), %xmm0
2801; SSE2-NEXT:    paddb 32(%rdx), %xmm2
2802; SSE2-NEXT:    paddb 16(%rdx), %xmm1
2803; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
2804; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
2805; SSE2-NEXT:    movdqa %xmm0, (%rcx)
2806; SSE2-NEXT:    retq
2807;
2808; SSE42-LABEL: vec384_v48i8_to_v16i24_factor3:
2809; SSE42:       # %bb.0:
2810; SSE42-NEXT:    movdqa (%rdi), %xmm0
2811; SSE42-NEXT:    paddb (%rsi), %xmm0
2812; SSE42-NEXT:    movdqa %xmm0, %xmm1
2813; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero
2814; SSE42-NEXT:    movdqa %xmm0, %xmm2
2815; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5]
2816; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero
2817; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2818; SSE42-NEXT:    paddb (%rdx), %xmm2
2819; SSE42-NEXT:    paddb 32(%rdx), %xmm1
2820; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
2821; SSE42-NEXT:    movdqa %xmm2, (%rcx)
2822; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2823; SSE42-NEXT:    retq
2824;
2825; AVX-LABEL: vec384_v48i8_to_v16i24_factor3:
2826; AVX:       # %bb.0:
2827; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2828; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2829; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5]
2830; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero
2831; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2832; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
2833; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
2834; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2835; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2836; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
2837; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
2838; AVX-NEXT:    retq
2839;
2840; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3:
2841; AVX2:       # %bb.0:
2842; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2843; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2844; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
2845; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero
2846; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2847; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2848; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2849; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2850; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2851; AVX2-NEXT:    vzeroupper
2852; AVX2-NEXT:    retq
2853;
2854; AVX512F-LABEL: vec384_v48i8_to_v16i24_factor3:
2855; AVX512F:       # %bb.0:
2856; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
2857; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2858; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
2859; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero
2860; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2861; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2862; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2863; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2864; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2865; AVX512F-NEXT:    vzeroupper
2866; AVX512F-NEXT:    retq
2867;
2868; AVX512BW-LABEL: vec384_v48i8_to_v16i24_factor3:
2869; AVX512BW:       # %bb.0:
2870; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2871; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2872; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2,0,3,3,0,4,4,0,5]
2873; AVX512BW-NEXT:    vpermw %ymm0, %ymm1, %ymm1
2874; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2875; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2876; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2877; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2878; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2879; AVX512BW-NEXT:    vzeroupper
2880; AVX512BW-NEXT:    retq
2881  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2882  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2883  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2884  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
2885  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 1, i32 52, i32 53, i32 2, i32 55, i32 56, i32 3, i32 58, i32 59, i32 4, i32 61, i32 62, i32 5, i32 64, i32 65, i32 6, i32 67, i32 68, i32 7, i32 70, i32 71, i32 8, i32 73, i32 74, i32 9, i32 76, i32 77, i32 10, i32 79, i32 80, i32 11, i32 82, i32 83, i32 12, i32 85, i32 86, i32 13, i32 88, i32 89, i32 14, i32 91, i32 92, i32 15, i32 94, i32 95>
2886  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2887  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2888  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2889  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2890  ret void
2891}
2892
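; Zero-extend each of the first 12 bytes of the biased 48-byte input into an i32 lane,
; then add the output bias and store.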
2893define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2894; SSE2-LABEL: vec384_v48i8_to_v12i32_factor4:
2895; SSE2:       # %bb.0:
2896; SSE2-NEXT:    movdqa (%rdi), %xmm0
2897; SSE2-NEXT:    paddb (%rsi), %xmm0
2898; SSE2-NEXT:    pxor %xmm1, %xmm1
2899; SSE2-NEXT:    movdqa %xmm0, %xmm2
2900; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2901; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2902; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2903; SSE2-NEXT:    movdqa %xmm0, %xmm3
2904; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2905; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2906; SSE2-NEXT:    paddb 16(%rdx), %xmm0
2907; SSE2-NEXT:    paddb (%rdx), %xmm3
2908; SSE2-NEXT:    paddb 32(%rdx), %xmm2
2909; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
2910; SSE2-NEXT:    movdqa %xmm3, (%rcx)
2911; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
2912; SSE2-NEXT:    retq
2913;
2914; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4:
2915; SSE42:       # %bb.0:
2916; SSE42-NEXT:    movdqa (%rdi), %xmm0
2917; SSE42-NEXT:    paddb (%rsi), %xmm0
2918; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2919; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2920; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
2921; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2922; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2923; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2924; SSE42-NEXT:    paddb 32(%rdx), %xmm2
2925; SSE42-NEXT:    paddb (%rdx), %xmm1
2926; SSE42-NEXT:    movdqa %xmm1, (%rcx)
2927; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
2928; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2929; SSE42-NEXT:    retq
2930;
2931; AVX-LABEL: vec384_v48i8_to_v12i32_factor4:
2932; AVX:       # %bb.0:
2933; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2934; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2935; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2936; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
2937; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
2938; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2939; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2940; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
2941; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
2942; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2943; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2944; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
2945; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
2946; AVX-NEXT:    retq
2947;
2948; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4:
2949; AVX2-SLOW:       # %bb.0:
2950; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
2951; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2952; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2953; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2954; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2955; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2956; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2957; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
2958; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
2959; AVX2-SLOW-NEXT:    vzeroupper
2960; AVX2-SLOW-NEXT:    retq
2961;
2962; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4:
2963; AVX2-FAST-PERLANE:       # %bb.0:
2964; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
2965; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2966; AVX2-FAST-PERLANE-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2967; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
2968; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2969; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2970; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
2971; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
2972; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2973; AVX2-FAST-PERLANE-NEXT:    retq
2974;
2975; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4:
2976; AVX2-FAST:       # %bb.0:
2977; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
2978; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2979; AVX2-FAST-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2980; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
2981; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2982; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2983; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
2984; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
2985; AVX2-FAST-NEXT:    vzeroupper
2986; AVX2-FAST-NEXT:    retq
2987;
2988; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4:
2989; AVX512F:       # %bb.0:
2990; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2991; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2992; AVX512F-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2993; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
2994; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2995; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2996; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2997; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2998; AVX512F-NEXT:    vzeroupper
2999; AVX512F-NEXT:    retq
3000;
3001; AVX512BW-LABEL: vec384_v48i8_to_v12i32_factor4:
3002; AVX512BW:       # %bb.0:
3003; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
3004; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3005; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3006; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3007; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3008; AVX512BW-NEXT:    vzeroupper
3009; AVX512BW-NEXT:    retq
3010  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3011  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3012  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3013  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3014  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 1, i32 53, i32 54, i32 55, i32 2, i32 57, i32 58, i32 59, i32 3, i32 61, i32 62, i32 63, i32 4, i32 65, i32 66, i32 67, i32 5, i32 69, i32 70, i32 71, i32 6, i32 73, i32 74, i32 75, i32 7, i32 77, i32 78, i32 79, i32 8, i32 81, i32 82, i32 83, i32 9, i32 85, i32 86, i32 87, i32 10, i32 89, i32 90, i32 91, i32 11, i32 93, i32 94, i32 95>
3015  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3016  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3017  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3018  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3019  ret void
3020}
3021
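; Zero-extend each of the first 8 bytes of the biased 48-byte input into an i48 lane
; (source byte kept in the low byte of each 6-byte lane), then add the output bias and store.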
3022define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3023; SSE2-LABEL: vec384_v48i8_to_v8i48_factor6:
3024; SSE2:       # %bb.0:
3025; SSE2-NEXT:    movdqa (%rdi), %xmm0
3026; SSE2-NEXT:    paddb (%rsi), %xmm0
3027; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3028; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3029; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3030; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3031; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
3032; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3033; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3034; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3035; SSE2-NEXT:    paddb (%rdx), %xmm2
3036; SSE2-NEXT:    paddb 32(%rdx), %xmm1
3037; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
3038; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3039; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3040; SSE2-NEXT:    retq
3041;
3042; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6:
3043; SSE42:       # %bb.0:
3044; SSE42-NEXT:    movdqa (%rdi), %xmm0
3045; SSE42-NEXT:    paddb (%rsi), %xmm0
3046; SSE42-NEXT:    movdqa %xmm0, %xmm1
3047; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero
3048; SSE42-NEXT:    movdqa %xmm0, %xmm2
3049; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero
3050; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero
3051; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3052; SSE42-NEXT:    paddb (%rdx), %xmm2
3053; SSE42-NEXT:    paddb 32(%rdx), %xmm1
3054; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
3055; SSE42-NEXT:    movdqa %xmm2, (%rcx)
3056; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3057; SSE42-NEXT:    retq
3058;
3059; AVX-LABEL: vec384_v48i8_to_v8i48_factor6:
3060; AVX:       # %bb.0:
3061; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3062; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3063; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero
3064; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero
3065; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3066; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
3067; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
3068; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3069; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3070; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3071; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
3072; AVX-NEXT:    retq
3073;
3074; AVX2-LABEL: vec384_v48i8_to_v8i48_factor6:
3075; AVX2:       # %bb.0:
3076; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3077; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3078; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3079; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u]
3080; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
3081; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3082; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3083; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3084; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
3085; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
3086; AVX2-NEXT:    vzeroupper
3087; AVX2-NEXT:    retq
3088;
3089; AVX512F-LABEL: vec384_v48i8_to_v8i48_factor6:
3090; AVX512F:       # %bb.0:
3091; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3092; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3093; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3094; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u]
3095; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
3096; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3097; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3098; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3099; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
3100; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
3101; AVX512F-NEXT:    vzeroupper
3102; AVX512F-NEXT:    retq
3103;
3104; AVX512BW-LABEL: vec384_v48i8_to_v8i48_factor6:
3105; AVX512BW:       # %bb.0:
3106; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
3107; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3108; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3109; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5]
3110; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
3111; AVX512BW-NEXT:    vpermw %ymm1, %ymm2, %ymm1
3112; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3113; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3114; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3115; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3116; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3117; AVX512BW-NEXT:    vzeroupper
3118; AVX512BW-NEXT:    retq
3119  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3120  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3121  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3122  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3123  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 1, i32 55, i32 56, i32 57, i32 58, i32 59, i32 2, i32 61, i32 62, i32 63, i32 64, i32 65, i32 3, i32 67, i32 68, i32 69, i32 70, i32 71, i32 4, i32 73, i32 74, i32 75, i32 76, i32 77, i32 5, i32 79, i32 80, i32 81, i32 82, i32 83, i32 6, i32 85, i32 86, i32 87, i32 88, i32 89, i32 7, i32 91, i32 92, i32 93, i32 94, i32 95>
3124  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3125  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3126  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3127  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3128  ret void
3129}
3130
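; Zero-extend each of the first 6 bytes of the biased 48-byte input into an i64 lane,
; then add the output bias and store.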
3131define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3132; SSE2-LABEL: vec384_v48i8_to_v6i64_factor8:
3133; SSE2:       # %bb.0:
3134; SSE2-NEXT:    movdqa (%rdi), %xmm0
3135; SSE2-NEXT:    paddb (%rsi), %xmm0
3136; SSE2-NEXT:    pxor %xmm1, %xmm1
3137; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3138; SSE2-NEXT:    movdqa %xmm0, %xmm2
3139; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3140; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3141; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3142; SSE2-NEXT:    movdqa %xmm0, %xmm3
3143; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3144; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3145; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3146; SSE2-NEXT:    paddb (%rdx), %xmm3
3147; SSE2-NEXT:    paddb 32(%rdx), %xmm2
3148; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
3149; SSE2-NEXT:    movdqa %xmm3, (%rcx)
3150; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3151; SSE2-NEXT:    retq
3152;
3153; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8:
3154; SSE42:       # %bb.0:
3155; SSE42-NEXT:    movdqa (%rdi), %xmm0
3156; SSE42-NEXT:    paddb (%rsi), %xmm0
3157; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3158; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
3159; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
3160; SSE42-NEXT:    psrld $16, %xmm0
3161; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3162; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3163; SSE42-NEXT:    paddb 32(%rdx), %xmm2
3164; SSE42-NEXT:    paddb (%rdx), %xmm1
3165; SSE42-NEXT:    movdqa %xmm1, (%rcx)
3166; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
3167; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3168; SSE42-NEXT:    retq
3169;
3170; AVX-LABEL: vec384_v48i8_to_v6i64_factor8:
3171; AVX:       # %bb.0:
3172; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3173; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3174; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3175; AVX-NEXT:    vpsrld $16, %xmm0, %xmm2
3176; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
3177; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3178; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3179; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
3180; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
3181; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3182; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3183; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3184; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
3185; AVX-NEXT:    retq
3186;
3187; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8:
3188; AVX2-SLOW:       # %bb.0:
3189; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
3190; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3191; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3192; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3193; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3194; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3195; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3196; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
3197; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
3198; AVX2-SLOW-NEXT:    vzeroupper
3199; AVX2-SLOW-NEXT:    retq
3200;
3201; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8:
3202; AVX2-FAST-PERLANE:       # %bb.0:
3203; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
3204; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3205; AVX2-FAST-PERLANE-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3206; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
3207; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3208; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3209; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
3210; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
3211; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3212; AVX2-FAST-PERLANE-NEXT:    retq
3213;
3214; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8:
3215; AVX2-FAST:       # %bb.0:
3216; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
3217; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3218; AVX2-FAST-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3219; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
3220; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3221; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3222; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
3223; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
3224; AVX2-FAST-NEXT:    vzeroupper
3225; AVX2-FAST-NEXT:    retq
3226;
3227; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8:
3228; AVX512F:       # %bb.0:
3229; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3230; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3231; AVX512F-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3232; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
3233; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3234; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3235; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3236; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3237; AVX512F-NEXT:    vzeroupper
3238; AVX512F-NEXT:    retq
3239;
3240; AVX512BW-LABEL: vec384_v48i8_to_v6i64_factor8:
3241; AVX512BW:       # %bb.0:
3242; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
3243; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3244; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
3245; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3246; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3247; AVX512BW-NEXT:    vzeroupper
3248; AVX512BW-NEXT:    retq
3249  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3250  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3251  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3252  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3253  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 1, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 2, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 3, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 4, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 5, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3254  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3255  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3256  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3257  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3258  ret void
3259}
3260
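; Zero-extend each of the first 4 bytes of the biased 48-byte input into an i96 lane
; (source byte kept in the low byte of each 12-byte lane), then add the output bias and store.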
3261define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3262; SSE2-LABEL: vec384_v48i8_to_v4i96_factor12:
3263; SSE2:       # %bb.0:
3264; SSE2-NEXT:    movdqa (%rdi), %xmm0
3265; SSE2-NEXT:    paddb (%rsi), %xmm0
3266; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3267; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3268; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
3269; SSE2-NEXT:    movdqa %xmm0, %xmm2
3270; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3271; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3272; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
3273; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3274; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
3275; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3276; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3277; SSE2-NEXT:    paddb (%rdx), %xmm0
3278; SSE2-NEXT:    paddb 16(%rdx), %xmm2
3279; SSE2-NEXT:    paddb 32(%rdx), %xmm1
3280; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
3281; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
3282; SSE2-NEXT:    movdqa %xmm0, (%rcx)
3283; SSE2-NEXT:    retq
3284;
3285; SSE42-LABEL: vec384_v48i8_to_v4i96_factor12:
3286; SSE42:       # %bb.0:
3287; SSE42-NEXT:    movdqa (%rdi), %xmm0
3288; SSE42-NEXT:    paddb (%rsi), %xmm0
3289; SSE42-NEXT:    movdqa %xmm0, %xmm1
3290; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3291; SSE42-NEXT:    movdqa %xmm0, %xmm2
3292; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero
3293; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
3294; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3295; SSE42-NEXT:    paddb (%rdx), %xmm2
3296; SSE42-NEXT:    paddb 32(%rdx), %xmm1
3297; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
3298; SSE42-NEXT:    movdqa %xmm2, (%rcx)
3299; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3300; SSE42-NEXT:    retq
3301;
3302; AVX-LABEL: vec384_v48i8_to_v4i96_factor12:
3303; AVX:       # %bb.0:
3304; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3305; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3306; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero
3307; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
3308; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3309; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
3310; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
3311; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3312; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3313; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3314; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
3315; AVX-NEXT:    retq
3316;
3317; AVX2-LABEL: vec384_v48i8_to_v4i96_factor12:
3318; AVX2:       # %bb.0:
3319; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3320; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3321; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3322; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3323; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3324; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3325; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3326; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3327; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
3328; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
3329; AVX2-NEXT:    vzeroupper
3330; AVX2-NEXT:    retq
3331;
3332; AVX512F-LABEL: vec384_v48i8_to_v4i96_factor12:
3333; AVX512F:       # %bb.0:
3334; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3335; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3336; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3337; AVX512F-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3338; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3339; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3340; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3341; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3342; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
3343; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
3344; AVX512F-NEXT:    vzeroupper
3345; AVX512F-NEXT:    retq
3346;
3347; AVX512BW-LABEL: vec384_v48i8_to_v4i96_factor12:
3348; AVX512BW:       # %bb.0:
3349; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
3350; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3351; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3352; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3353; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3354; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3355; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3356; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3357; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3358; AVX512BW-NEXT:    vzeroupper
3359; AVX512BW-NEXT:    retq
3360  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3361  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3362  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3363  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3364  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 1, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 2, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 3, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3365  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3366  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3367  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3368  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3369  ret void
3370}
3371
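; Zero-extend each of the first 3 bytes of the biased 48-byte input into an i128 lane
; (source byte kept in the low byte of each 16-byte lane), then add the output bias and store.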
3372define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3373; SSE2-LABEL: vec384_v48i8_to_v3i128_factor16:
3374; SSE2:       # %bb.0:
3375; SSE2-NEXT:    movdqa (%rdi), %xmm0
3376; SSE2-NEXT:    paddb (%rsi), %xmm0
3377; SSE2-NEXT:    movd {{.*#+}} xmm1 = [255,0,0,0]
3378; SSE2-NEXT:    pand %xmm0, %xmm1
3379; SSE2-NEXT:    movdqa %xmm0, %xmm2
3380; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3381; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3382; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3383; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3384; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3385; SSE2-NEXT:    paddb 32(%rdx), %xmm2
3386; SSE2-NEXT:    paddb (%rdx), %xmm1
3387; SSE2-NEXT:    movdqa %xmm1, (%rcx)
3388; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
3389; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3390; SSE2-NEXT:    retq
3391;
3392; SSE42-LABEL: vec384_v48i8_to_v3i128_factor16:
3393; SSE42:       # %bb.0:
3394; SSE42-NEXT:    movdqa (%rdi), %xmm0
3395; SSE42-NEXT:    paddb (%rsi), %xmm0
3396; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = [255,0]
3397; SSE42-NEXT:    pand %xmm0, %xmm1
3398; SSE42-NEXT:    movdqa %xmm0, %xmm2
3399; SSE42-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3400; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3401; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3402; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3403; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3404; SSE42-NEXT:    paddb 32(%rdx), %xmm2
3405; SSE42-NEXT:    paddb (%rdx), %xmm1
3406; SSE42-NEXT:    movdqa %xmm1, (%rcx)
3407; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
3408; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3409; SSE42-NEXT:    retq
3410;
3411; AVX-LABEL: vec384_v48i8_to_v3i128_factor16:
3412; AVX:       # %bb.0:
3413; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3414; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3415; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3416; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3417; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3418; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
3419; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3420; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
3421; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
3422; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3423; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3424; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3425; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
3426; AVX-NEXT:    retq
3427;
3428; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16:
3429; AVX2-SLOW:       # %bb.0:
3430; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
3431; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3432; AVX2-SLOW-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
3433; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3434; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3435; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3436; AVX2-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3437; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3438; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3439; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
3440; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
3441; AVX2-SLOW-NEXT:    vzeroupper
3442; AVX2-SLOW-NEXT:    retq
3443;
3444; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16:
3445; AVX2-FAST-PERLANE:       # %bb.0:
3446; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
3447; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3448; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3449; AVX2-FAST-PERLANE-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3450; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3451; AVX2-FAST-PERLANE-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3452; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3453; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3454; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
3455; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
3456; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3457; AVX2-FAST-PERLANE-NEXT:    retq
3458;
3459; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16:
3460; AVX2-FAST:       # %bb.0:
3461; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
3462; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3463; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3464; AVX2-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3465; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3466; AVX2-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3467; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3468; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3469; AVX2-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
3470; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
3471; AVX2-FAST-NEXT:    vzeroupper
3472; AVX2-FAST-NEXT:    retq
3473;
3474; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16:
3475; AVX512F:       # %bb.0:
3476; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3477; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3478; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3479; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3480; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3481; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3482; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3483; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
3484; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
3485; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
3486; AVX512F-NEXT:    vzeroupper
3487; AVX512F-NEXT:    retq
3488;
3489; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16:
3490; AVX512BW:       # %bb.0:
3491; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
3492; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3493; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3494; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
3495; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3496; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3497; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3498; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3499; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3500; AVX512BW-NEXT:    vzeroupper
3501; AVX512BW-NEXT:    retq
3502  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3503  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3504  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3505  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3506  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 1, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3507  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3508  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3509  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3510  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3511  ret void
3512}
3513
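; Zero-extend each of the first 2 bytes of the biased 48-byte input into an i192 lane
; (source byte kept in the low byte of each 24-byte lane), then add the output bias and store.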
3514define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3515; SSE2-LABEL: vec384_v48i8_to_v2i192_factor24:
3516; SSE2:       # %bb.0:
3517; SSE2-NEXT:    movdqa (%rdi), %xmm0
3518; SSE2-NEXT:    paddb (%rsi), %xmm0
3519; SSE2-NEXT:    movd {{.*#+}} xmm1 = [255,0,0,0]
3520; SSE2-NEXT:    pand %xmm0, %xmm1
3521; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3522; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3523; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
3524; SSE2-NEXT:    movaps 32(%rdx), %xmm2
3525; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3526; SSE2-NEXT:    paddb (%rdx), %xmm1
3527; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
3528; SSE2-NEXT:    movdqa %xmm1, (%rcx)
3529; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3530; SSE2-NEXT:    retq
3531;
3532; SSE42-LABEL: vec384_v48i8_to_v2i192_factor24:
3533; SSE42:       # %bb.0:
3534; SSE42-NEXT:    movdqa (%rdi), %xmm0
3535; SSE42-NEXT:    paddb (%rsi), %xmm0
3536; SSE42-NEXT:    movdqa %xmm0, %xmm1
3537; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
3538; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3539; SSE42-NEXT:    movaps 32(%rdx), %xmm2
3540; SSE42-NEXT:    paddb (%rdx), %xmm0
3541; SSE42-NEXT:    paddb 16(%rdx), %xmm1
3542; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
3543; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
3544; SSE42-NEXT:    movdqa %xmm0, (%rcx)
3545; SSE42-NEXT:    retq
3546;
3547; AVX-LABEL: vec384_v48i8_to_v2i192_factor24:
3548; AVX:       # %bb.0:
3549; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3550; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3551; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3552; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3553; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
3554; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3555; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3556; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
3557; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3558; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3559; AVX-NEXT:    vzeroupper
3560; AVX-NEXT:    retq
3561;
3562; AVX2-LABEL: vec384_v48i8_to_v2i192_factor24:
3563; AVX2:       # %bb.0:
3564; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3565; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3566; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3567; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3568; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3569; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
3570; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3571; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
3572; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
3573; AVX2-NEXT:    vzeroupper
3574; AVX2-NEXT:    retq
3575;
3576; AVX512F-LABEL: vec384_v48i8_to_v2i192_factor24:
3577; AVX512F:       # %bb.0:
3578; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3579; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3580; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3581; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3582; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3583; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3584; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
3585; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
3586; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
3587; AVX512F-NEXT:    vzeroupper
3588; AVX512F-NEXT:    retq
3589;
3590; AVX512BW-LABEL: vec384_v48i8_to_v2i192_factor24:
3591; AVX512BW:       # %bb.0:
3592; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
3593; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3594; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3595; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3596; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3597; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3598; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3599; AVX512BW-NEXT:    vzeroupper
3600; AVX512BW-NEXT:    retq
3601  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3602  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3603  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3604  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3605  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 1, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3606  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3607  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3608  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3609  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3610  ret void
3611}
3612
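; Zero-extend only byte 0 of the 48-byte input to the full 384-bit result; every other output byte comes from zeroinitializer.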
3613define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3614; SSE-LABEL: vec384_v48i8_to_v1i384_factor48:
3615; SSE:       # %bb.0:
3616; SSE-NEXT:    movdqa (%rdi), %xmm0
3617; SSE-NEXT:    paddb (%rsi), %xmm0
3618; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3619; SSE-NEXT:    movaps 16(%rdx), %xmm1
3620; SSE-NEXT:    movaps 32(%rdx), %xmm2
3621; SSE-NEXT:    paddb (%rdx), %xmm0
3622; SSE-NEXT:    movaps %xmm1, 16(%rcx)
3623; SSE-NEXT:    movaps %xmm2, 32(%rcx)
3624; SSE-NEXT:    movdqa %xmm0, (%rcx)
3625; SSE-NEXT:    retq
3626;
3627; AVX-LABEL: vec384_v48i8_to_v1i384_factor48:
3628; AVX:       # %bb.0:
3629; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3630; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3631; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
3632; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3633; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
3634; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
3635; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
3636; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
3637; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
3638; AVX-NEXT:    vzeroupper
3639; AVX-NEXT:    retq
3640;
3641; AVX2-LABEL: vec384_v48i8_to_v1i384_factor48:
3642; AVX2:       # %bb.0:
3643; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
3644; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3645; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
3646; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
3647; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
3648; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3649; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
3650; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
3651; AVX2-NEXT:    vzeroupper
3652; AVX2-NEXT:    retq
3653;
3654; AVX512F-LABEL: vec384_v48i8_to_v1i384_factor48:
3655; AVX512F:       # %bb.0:
3656; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
3657; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3658; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
3659; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
3660; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3661; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
3662; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
3663; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
3664; AVX512F-NEXT:    vzeroupper
3665; AVX512F-NEXT:    retq
3666;
3667; AVX512BW-LABEL: vec384_v48i8_to_v1i384_factor48:
3668; AVX512BW:       # %bb.0:
3669; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3670; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3671; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
3672; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
3673; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3674; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3675; AVX512BW-NEXT:    vzeroupper
3676; AVX512BW-NEXT:    retq
3677  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3678  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3679  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3680  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3681  %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3682  %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3683  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3684  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3685  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3686  ret void
3687}
3688
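; Interleave the low 12 i16 elements with zero words: an in-register <24 x i16> -> <12 x i32> zero extension (factor 2).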
3689define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3690; SSE2-LABEL: vec384_v24i16_to_v12i32_factor2:
3691; SSE2:       # %bb.0:
3692; SSE2-NEXT:    movdqa (%rdi), %xmm0
3693; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
3694; SSE2-NEXT:    paddb (%rsi), %xmm0
3695; SSE2-NEXT:    paddb 16(%rsi), %xmm1
3696; SSE2-NEXT:    pxor %xmm2, %xmm2
3697; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3698; SSE2-NEXT:    movdqa %xmm0, %xmm3
3699; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3700; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3701; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3702; SSE2-NEXT:    paddb (%rdx), %xmm3
3703; SSE2-NEXT:    paddb 32(%rdx), %xmm1
3704; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
3705; SSE2-NEXT:    movdqa %xmm3, (%rcx)
3706; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3707; SSE2-NEXT:    retq
3708;
3709; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2:
3710; SSE42:       # %bb.0:
3711; SSE42-NEXT:    movdqa (%rdi), %xmm0
3712; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
3713; SSE42-NEXT:    paddb (%rsi), %xmm0
3714; SSE42-NEXT:    paddb 16(%rsi), %xmm1
3715; SSE42-NEXT:    pxor %xmm2, %xmm2
3716; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
3717; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3718; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3719; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3720; SSE42-NEXT:    paddb (%rdx), %xmm3
3721; SSE42-NEXT:    paddb 32(%rdx), %xmm1
3722; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
3723; SSE42-NEXT:    movdqa %xmm3, (%rcx)
3724; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3725; SSE42-NEXT:    retq
3726;
3727; AVX-LABEL: vec384_v24i16_to_v12i32_factor2:
3728; AVX:       # %bb.0:
3729; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3730; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
3731; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
3732; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3733; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3734; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3735; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3736; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
3737; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
3738; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3739; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
3740; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
3741; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3742; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
3743; AVX-NEXT:    retq
3744;
3745; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2:
3746; AVX2:       # %bb.0:
3747; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
3748; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3749; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3750; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
3751; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3752; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3753; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3754; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
3755; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
3756; AVX2-NEXT:    vzeroupper
3757; AVX2-NEXT:    retq
3758;
3759; AVX512F-LABEL: vec384_v24i16_to_v12i32_factor2:
3760; AVX512F:       # %bb.0:
3761; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
3762; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3763; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3764; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
3765; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3766; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3767; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3768; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3769; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3770; AVX512F-NEXT:    vzeroupper
3771; AVX512F-NEXT:    retq
3772;
3773; AVX512BW-LABEL: vec384_v24i16_to_v12i32_factor2:
3774; AVX512BW:       # %bb.0:
3775; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
3776; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3777; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3778; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
3779; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3780; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3781; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3782; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3783; AVX512BW-NEXT:    vzeroupper
3784; AVX512BW-NEXT:    retq
3785  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3786  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3787  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3788  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3789  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
3790  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 1, i32 27, i32 2, i32 29, i32 3, i32 31, i32 4, i32 33, i32 5, i32 35, i32 6, i32 37, i32 7, i32 39, i32 8, i32 41, i32 9, i32 43, i32 10, i32 45, i32 11, i32 47>
3791  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
3792  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3793  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3794  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3795  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3796  ret void
3797}
3798
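; Spread the low 8 i16 elements to every third word lane, zero-filling in between (i16 -> i48, factor 3).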
3799define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3800; SSE2-LABEL: vec384_v24i16_to_v8i48_factor3:
3801; SSE2:       # %bb.0:
3802; SSE2-NEXT:    movdqa (%rdi), %xmm0
3803; SSE2-NEXT:    paddb (%rsi), %xmm0
3804; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3805; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3806; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3807; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
3808; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3809; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3810; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3811; SSE2-NEXT:    paddb (%rdx), %xmm2
3812; SSE2-NEXT:    paddb 32(%rdx), %xmm1
3813; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
3814; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3815; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3816; SSE2-NEXT:    retq
3817;
3818; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3:
3819; SSE42:       # %bb.0:
3820; SSE42-NEXT:    movdqa (%rdi), %xmm0
3821; SSE42-NEXT:    paddb (%rsi), %xmm0
3822; SSE42-NEXT:    pxor %xmm1, %xmm1
3823; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
3824; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
3825; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
3826; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
3827; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3828; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
3829; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3830; SSE42-NEXT:    paddb (%rdx), %xmm3
3831; SSE42-NEXT:    paddb 32(%rdx), %xmm2
3832; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
3833; SSE42-NEXT:    movdqa %xmm3, (%rcx)
3834; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3835; SSE42-NEXT:    retq
3836;
3837; AVX-LABEL: vec384_v24i16_to_v8i48_factor3:
3838; AVX:       # %bb.0:
3839; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3840; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3841; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3842; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3843; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
3844; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
3845; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
3846; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3847; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3848; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
3849; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm2
3850; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3851; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3852; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3853; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
3854; AVX-NEXT:    retq
3855;
3856; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3:
3857; AVX2-SLOW:       # %bb.0:
3858; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
3859; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3860; AVX2-SLOW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3861; AVX2-SLOW-NEXT:    vpermd %ymm0, %ymm1, %ymm1
3862; AVX2-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3863; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3864; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3865; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3866; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3867; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3868; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
3869; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
3870; AVX2-SLOW-NEXT:    vzeroupper
3871; AVX2-SLOW-NEXT:    retq
3872;
3873; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v8i48_factor3:
3874; AVX2-FAST-PERLANE:       # %bb.0:
3875; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
3876; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3877; AVX2-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3878; AVX2-FAST-PERLANE-NEXT:    vpermd %ymm0, %ymm1, %ymm1
3879; AVX2-FAST-PERLANE-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3880; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3881; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3882; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3883; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
3884; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
3885; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3886; AVX2-FAST-PERLANE-NEXT:    retq
3887;
3888; AVX2-FAST-LABEL: vec384_v24i16_to_v8i48_factor3:
3889; AVX2-FAST:       # %bb.0:
3890; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
3891; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3892; AVX2-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3893; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
3894; AVX2-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3895; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3896; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3897; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3898; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
3899; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
3900; AVX2-FAST-NEXT:    vzeroupper
3901; AVX2-FAST-NEXT:    retq
3902;
3903; AVX512F-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3:
3904; AVX512F-SLOW:       # %bb.0:
3905; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
3906; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3907; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3908; AVX512F-SLOW-NEXT:    vpermd %ymm0, %ymm1, %ymm1
3909; AVX512F-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3910; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3911; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3912; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3913; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3914; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3915; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
3916; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
3917; AVX512F-SLOW-NEXT:    vzeroupper
3918; AVX512F-SLOW-NEXT:    retq
3919;
3920; AVX512F-FAST-LABEL: vec384_v24i16_to_v8i48_factor3:
3921; AVX512F-FAST:       # %bb.0:
3922; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
3923; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3924; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3925; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
3926; AVX512F-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3927; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3928; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3929; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3930; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
3931; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
3932; AVX512F-FAST-NEXT:    vzeroupper
3933; AVX512F-FAST-NEXT:    retq
3934;
3935; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3:
3936; AVX512BW-SLOW:       # %bb.0:
3937; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
3938; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3939; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21]
3940; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3941; AVX512BW-SLOW-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
3942; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3943; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3944; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
3945; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
3946; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3947; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3948; AVX512BW-SLOW-NEXT:    vzeroupper
3949; AVX512BW-SLOW-NEXT:    retq
3950;
3951; AVX512BW-FAST-LABEL: vec384_v24i16_to_v8i48_factor3:
3952; AVX512BW-FAST:       # %bb.0:
3953; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
3954; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3955; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21]
3956; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3957; AVX512BW-FAST-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
3958; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3959; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
3960; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3961; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
3962; AVX512BW-FAST-NEXT:    vzeroupper
3963; AVX512BW-FAST-NEXT:    retq
3964  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3965  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3966  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3967  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3968  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
3969  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 1, i32 28, i32 29, i32 2, i32 31, i32 32, i32 3, i32 34, i32 35, i32 4, i32 37, i32 38, i32 5, i32 40, i32 41, i32 6, i32 43, i32 44, i32 7, i32 46, i32 47>
3970  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
3971  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3972  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3973  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3974  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3975  ret void
3976}
3977
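; Place the low 6 i16 elements at 64-bit strides, i.e. zero-extend each to an i64 lane (factor 4).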
3978define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3979; SSE2-LABEL: vec384_v24i16_to_v6i64_factor4:
3980; SSE2:       # %bb.0:
3981; SSE2-NEXT:    movdqa (%rdi), %xmm0
3982; SSE2-NEXT:    paddb (%rsi), %xmm0
3983; SSE2-NEXT:    pxor %xmm1, %xmm1
3984; SSE2-NEXT:    movdqa %xmm0, %xmm2
3985; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3986; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3987; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3988; SSE2-NEXT:    movdqa %xmm0, %xmm3
3989; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3990; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3991; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3992; SSE2-NEXT:    paddb (%rdx), %xmm3
3993; SSE2-NEXT:    paddb 32(%rdx), %xmm2
3994; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
3995; SSE2-NEXT:    movdqa %xmm3, (%rcx)
3996; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3997; SSE2-NEXT:    retq
3998;
3999; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4:
4000; SSE42:       # %bb.0:
4001; SSE42-NEXT:    movdqa (%rdi), %xmm0
4002; SSE42-NEXT:    paddb (%rsi), %xmm0
4003; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4004; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
4005; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4006; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4007; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4008; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4009; SSE42-NEXT:    paddb 32(%rdx), %xmm2
4010; SSE42-NEXT:    paddb (%rdx), %xmm1
4011; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4012; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
4013; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4014; SSE42-NEXT:    retq
4015;
4016; AVX-LABEL: vec384_v24i16_to_v6i64_factor4:
4017; AVX:       # %bb.0:
4018; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4019; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4020; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4021; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
4022; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4023; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4024; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4025; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
4026; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
4027; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4028; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4029; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
4030; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
4031; AVX-NEXT:    retq
4032;
4033; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4:
4034; AVX2-SLOW:       # %bb.0:
4035; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
4036; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4037; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4038; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4039; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4040; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4041; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4042; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
4043; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
4044; AVX2-SLOW-NEXT:    vzeroupper
4045; AVX2-SLOW-NEXT:    retq
4046;
4047; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4:
4048; AVX2-FAST-PERLANE:       # %bb.0:
4049; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
4050; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4051; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4052; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4053; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4054; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4055; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
4056; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
4057; AVX2-FAST-PERLANE-NEXT:    vzeroupper
4058; AVX2-FAST-PERLANE-NEXT:    retq
4059;
4060; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4:
4061; AVX2-FAST:       # %bb.0:
4062; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
4063; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4064; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4065; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4066; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4067; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4068; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
4069; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
4070; AVX2-FAST-NEXT:    vzeroupper
4071; AVX2-FAST-NEXT:    retq
4072;
4073; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4:
4074; AVX512F:       # %bb.0:
4075; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
4076; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4077; AVX512F-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4078; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4079; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4080; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4081; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
4082; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
4083; AVX512F-NEXT:    vzeroupper
4084; AVX512F-NEXT:    retq
4085;
4086; AVX512BW-LABEL: vec384_v24i16_to_v6i64_factor4:
4087; AVX512BW:       # %bb.0:
4088; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
4089; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4090; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4091; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4092; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4093; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4094; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4095; AVX512BW-NEXT:    vzeroupper
4096; AVX512BW-NEXT:    retq
4097  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4098  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4099  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4100  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4101  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4102  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 1, i32 29, i32 30, i32 31, i32 2, i32 33, i32 34, i32 35, i32 3, i32 37, i32 38, i32 39, i32 4, i32 41, i32 42, i32 43, i32 5, i32 45, i32 46, i32 47>
4103  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4104  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4105  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4106  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4107  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4108  ret void
4109}
4110
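; Place the low 4 i16 elements at 96-bit strides (one live word every sixth position, factor 6).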
4111define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4112; SSE2-LABEL: vec384_v24i16_to_v4i96_factor6:
4113; SSE2:       # %bb.0:
4114; SSE2-NEXT:    movdqa (%rdi), %xmm0
4115; SSE2-NEXT:    paddb (%rsi), %xmm0
4116; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
4117; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4118; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
4119; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7]
4120; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
4121; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
4122; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4123; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4124; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4125; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4126; SSE2-NEXT:    paddb (%rdx), %xmm2
4127; SSE2-NEXT:    paddb 32(%rdx), %xmm1
4128; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
4129; SSE2-NEXT:    movdqa %xmm2, (%rcx)
4130; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4131; SSE2-NEXT:    retq
4132;
4133; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6:
4134; SSE42:       # %bb.0:
4135; SSE42-NEXT:    movdqa (%rdi), %xmm0
4136; SSE42-NEXT:    paddb (%rsi), %xmm0
4137; SSE42-NEXT:    pxor %xmm1, %xmm1
4138; SSE42-NEXT:    movdqa %xmm0, %xmm2
4139; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,3],zero,zero
4140; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
4141; SSE42-NEXT:    psrld $16, %xmm0
4142; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
4143; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7]
4144; SSE42-NEXT:    paddb 16(%rdx), %xmm3
4145; SSE42-NEXT:    paddb 32(%rdx), %xmm0
4146; SSE42-NEXT:    paddb (%rdx), %xmm2
4147; SSE42-NEXT:    movdqa %xmm2, (%rcx)
4148; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
4149; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
4150; SSE42-NEXT:    retq
4151;
4152; AVX-LABEL: vec384_v24i16_to_v4i96_factor6:
4153; AVX:       # %bb.0:
4154; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4155; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4156; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero
4157; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
4158; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4159; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7]
4160; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
4161; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
4162; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
4163; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
4164; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4165; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4166; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
4167; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
4168; AVX-NEXT:    retq
4169;
4170; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6:
4171; AVX2-SLOW:       # %bb.0:
4172; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
4173; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4174; AVX2-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
4175; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4176; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
4177; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4178; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4179; AVX2-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4180; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4181; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4182; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
4183; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
4184; AVX2-SLOW-NEXT:    vzeroupper
4185; AVX2-SLOW-NEXT:    retq
4186;
4187; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v4i96_factor6:
4188; AVX2-FAST-PERLANE:       # %bb.0:
4189; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
4190; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4191; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4192; AVX2-FAST-PERLANE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4193; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4194; AVX2-FAST-PERLANE-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4195; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4196; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4197; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
4198; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
4199; AVX2-FAST-PERLANE-NEXT:    vzeroupper
4200; AVX2-FAST-PERLANE-NEXT:    retq
4201;
4202; AVX2-FAST-LABEL: vec384_v24i16_to_v4i96_factor6:
4203; AVX2-FAST:       # %bb.0:
4204; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
4205; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4206; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4207; AVX2-FAST-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4208; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4209; AVX2-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4210; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4211; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4212; AVX2-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
4213; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
4214; AVX2-FAST-NEXT:    vzeroupper
4215; AVX2-FAST-NEXT:    retq
4216;
4217; AVX512F-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6:
4218; AVX512F-SLOW:       # %bb.0:
4219; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
4220; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4221; AVX512F-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
4222; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4223; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
4224; AVX512F-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4225; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4226; AVX512F-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4227; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4228; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4229; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
4230; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
4231; AVX512F-SLOW-NEXT:    vzeroupper
4232; AVX512F-SLOW-NEXT:    retq
4233;
4234; AVX512F-FAST-LABEL: vec384_v24i16_to_v4i96_factor6:
4235; AVX512F-FAST:       # %bb.0:
4236; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
4237; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4238; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4239; AVX512F-FAST-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4240; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4241; AVX512F-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4242; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4243; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4244; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
4245; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
4246; AVX512F-FAST-NEXT:    vzeroupper
4247; AVX512F-FAST-NEXT:    retq
4248;
4249; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6:
4250; AVX512BW-SLOW:       # %bb.0:
4251; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
4252; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4253; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15]
4254; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4255; AVX512BW-SLOW-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
4256; AVX512BW-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
4257; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4258; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
4259; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4260; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4261; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4262; AVX512BW-SLOW-NEXT:    vzeroupper
4263; AVX512BW-SLOW-NEXT:    retq
4264;
4265; AVX512BW-FAST-LABEL: vec384_v24i16_to_v4i96_factor6:
4266; AVX512BW-FAST:       # %bb.0:
4267; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
4268; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4269; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15]
4270; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4271; AVX512BW-FAST-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
4272; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4273; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4274; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4275; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4276; AVX512BW-FAST-NEXT:    vzeroupper
4277; AVX512BW-FAST-NEXT:    retq
4278  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4279  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4280  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4281  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4282  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4283  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 1, i32 31, i32 32, i32 33, i32 34, i32 35, i32 2, i32 37, i32 38, i32 39, i32 40, i32 41, i32 3, i32 43, i32 44, i32 45, i32 46, i32 47>
4284  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4285  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4286  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4287  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4288  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4289  ret void
4290}
4291
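; Zero-extend the low 3 i16 elements into 128-bit lanes (one live word per 16 bytes, factor 8).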
4292define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4293; SSE2-LABEL: vec384_v24i16_to_v3i128_factor8:
4294; SSE2:       # %bb.0:
4295; SSE2-NEXT:    movdqa (%rdi), %xmm0
4296; SSE2-NEXT:    paddb (%rsi), %xmm0
4297; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65535,0,0,0]
4298; SSE2-NEXT:    pand %xmm0, %xmm1
4299; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4300; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4301; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4302; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4303; SSE2-NEXT:    paddb 16(%rdx), %xmm2
4304; SSE2-NEXT:    paddb 32(%rdx), %xmm0
4305; SSE2-NEXT:    paddb (%rdx), %xmm1
4306; SSE2-NEXT:    movdqa %xmm1, (%rcx)
4307; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
4308; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
4309; SSE2-NEXT:    retq
4310;
4311; SSE42-LABEL: vec384_v24i16_to_v3i128_factor8:
4312; SSE42:       # %bb.0:
4313; SSE42-NEXT:    movdqa (%rdi), %xmm0
4314; SSE42-NEXT:    paddb (%rsi), %xmm0
4315; SSE42-NEXT:    pxor %xmm1, %xmm1
4316; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4317; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4318; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4319; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4320; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4321; SSE42-NEXT:    paddb 16(%rdx), %xmm2
4322; SSE42-NEXT:    paddb 32(%rdx), %xmm0
4323; SSE42-NEXT:    paddb (%rdx), %xmm1
4324; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4325; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
4326; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
4327; SSE42-NEXT:    retq
4328;
4329; AVX-LABEL: vec384_v24i16_to_v3i128_factor8:
4330; AVX:       # %bb.0:
4331; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4332; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4333; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4334; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4335; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4336; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4337; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4338; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4339; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
4340; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
4341; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4342; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4343; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
4344; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
4345; AVX-NEXT:    retq
4346;
4347; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
4348; AVX2-SLOW:       # %bb.0:
4349; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
4350; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
4351; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
4352; AVX2-SLOW-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4353; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4354; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4355; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4356; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4357; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4358; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
4359; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
4360; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
4361; AVX2-SLOW-NEXT:    vzeroupper
4362; AVX2-SLOW-NEXT:    retq
4363;
4364; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8:
4365; AVX2-FAST-PERLANE:       # %bb.0:
4366; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
4367; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
4368; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
4369; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4370; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4371; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4372; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4373; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4374; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
4375; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
4376; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
4377; AVX2-FAST-PERLANE-NEXT:    vzeroupper
4378; AVX2-FAST-PERLANE-NEXT:    retq
4379;
4380; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
4381; AVX2-FAST:       # %bb.0:
4382; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
4383; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm1
4384; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
4385; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4386; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4387; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4388; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4389; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4390; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
4391; AVX2-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
4392; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
4393; AVX2-FAST-NEXT:    vzeroupper
4394; AVX2-FAST-NEXT:    retq
4395;
4396; AVX512F-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
4397; AVX512F-SLOW:       # %bb.0:
4398; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
4399; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4400; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
4401; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4402; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
4403; AVX512F-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4404; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
4405; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4406; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
4407; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4408; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4409; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
4410; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
4411; AVX512F-SLOW-NEXT:    vzeroupper
4412; AVX512F-SLOW-NEXT:    retq
4413;
4414; AVX512F-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
4415; AVX512F-FAST:       # %bb.0:
4416; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
4417; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4418; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4419; AVX512F-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4420; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
4421; AVX512F-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4422; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
4423; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4424; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4425; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
4426; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
4427; AVX512F-FAST-NEXT:    vzeroupper
4428; AVX512F-FAST-NEXT:    retq
4429;
4430; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
4431; AVX512BW-SLOW:       # %bb.0:
4432; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
4433; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4434; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
4435; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4436; AVX512BW-SLOW-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
4437; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4438; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4439; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4440; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4441; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4442; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4443; AVX512BW-SLOW-NEXT:    vzeroupper
4444; AVX512BW-SLOW-NEXT:    retq
4445;
4446; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
4447; AVX512BW-FAST:       # %bb.0:
4448; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
4449; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4450; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
4451; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4452; AVX512BW-FAST-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
4453; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4454; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4455; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4456; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4457; AVX512BW-FAST-NEXT:    vzeroupper
4458; AVX512BW-FAST-NEXT:    retq
4459  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4460  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4461  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4462  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4463  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4464  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 1, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4465  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4466  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4467  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4468  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4469  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4470  ret void
4471}
4472
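; Keep i16 elements 0 and 1 and spread them 192 bits apart (factor 12).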
4473define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4474; SSE2-LABEL: vec384_v24i16_to_v2i192_factor12:
4475; SSE2:       # %bb.0:
4476; SSE2-NEXT:    movdqa (%rdi), %xmm0
4477; SSE2-NEXT:    paddb (%rsi), %xmm0
4478; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65535,0,0,0]
4479; SSE2-NEXT:    pand %xmm0, %xmm1
4480; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4481; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4482; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4483; SSE2-NEXT:    movaps 32(%rdx), %xmm2
4484; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4485; SSE2-NEXT:    paddb (%rdx), %xmm1
4486; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
4487; SSE2-NEXT:    movdqa %xmm1, (%rcx)
4488; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4489; SSE2-NEXT:    retq
4490;
4491; SSE42-LABEL: vec384_v24i16_to_v2i192_factor12:
4492; SSE42:       # %bb.0:
4493; SSE42-NEXT:    movdqa (%rdi), %xmm0
4494; SSE42-NEXT:    paddb (%rsi), %xmm0
4495; SSE42-NEXT:    pxor %xmm1, %xmm1
4496; SSE42-NEXT:    pxor %xmm2, %xmm2
4497; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4498; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4499; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
4500; SSE42-NEXT:    movaps 32(%rdx), %xmm1
4501; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4502; SSE42-NEXT:    paddb (%rdx), %xmm2
4503; SSE42-NEXT:    movaps %xmm1, 32(%rcx)
4504; SSE42-NEXT:    movdqa %xmm2, (%rcx)
4505; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4506; SSE42-NEXT:    retq
4507;
4508; AVX-LABEL: vec384_v24i16_to_v2i192_factor12:
4509; AVX:       # %bb.0:
4510; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4511; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4512; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4513; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4514; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4515; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
4516; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
4517; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
4518; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
4519; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
4520; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
4521; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
4522; AVX-NEXT:    vzeroupper
4523; AVX-NEXT:    retq
4524;
4525; AVX2-LABEL: vec384_v24i16_to_v2i192_factor12:
4526; AVX2:       # %bb.0:
4527; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
4528; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4529; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4530; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
4531; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4532; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
4533; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4534; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
4535; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4536; AVX2-NEXT:    vzeroupper
4537; AVX2-NEXT:    retq
4538;
4539; AVX512F-LABEL: vec384_v24i16_to_v2i192_factor12:
4540; AVX512F:       # %bb.0:
4541; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
4542; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4543; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4544; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
4545; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4546; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4547; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
4548; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
4549; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4550; AVX512F-NEXT:    vzeroupper
4551; AVX512F-NEXT:    retq
4552;
4553; AVX512BW-LABEL: vec384_v24i16_to_v2i192_factor12:
4554; AVX512BW:       # %bb.0:
4555; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
4556; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4557; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15]
4558; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4559; AVX512BW-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
4560; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
4561; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4562; AVX512BW-NEXT:    vzeroupper
4563; AVX512BW-NEXT:    retq
4564  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4565  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4566  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4567  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4568  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4569  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4570  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4571  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4572  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4573  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4574  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4575  ret void
4576}
4577
4578define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4579; SSE2-LABEL: vec384_v24i16_to_v1i384_factor24:
4580; SSE2:       # %bb.0:
4581; SSE2-NEXT:    movdqa (%rdi), %xmm0
4582; SSE2-NEXT:    paddb (%rsi), %xmm0
4583; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4584; SSE2-NEXT:    movaps 16(%rdx), %xmm1
4585; SSE2-NEXT:    movaps 32(%rdx), %xmm2
4586; SSE2-NEXT:    paddb (%rdx), %xmm0
4587; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
4588; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
4589; SSE2-NEXT:    movdqa %xmm0, (%rcx)
4590; SSE2-NEXT:    retq
4591;
4592; SSE42-LABEL: vec384_v24i16_to_v1i384_factor24:
4593; SSE42:       # %bb.0:
4594; SSE42-NEXT:    movdqa (%rdi), %xmm0
4595; SSE42-NEXT:    paddb (%rsi), %xmm0
4596; SSE42-NEXT:    pxor %xmm1, %xmm1
4597; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4598; SSE42-NEXT:    movaps 16(%rdx), %xmm0
4599; SSE42-NEXT:    movaps 32(%rdx), %xmm2
4600; SSE42-NEXT:    paddb (%rdx), %xmm1
4601; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
4602; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
4603; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4604; SSE42-NEXT:    retq
4605;
4606; AVX-LABEL: vec384_v24i16_to_v1i384_factor24:
4607; AVX:       # %bb.0:
4608; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4609; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4610; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
4611; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4612; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4613; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
4614; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
4615; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
4616; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
4617; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
4618; AVX-NEXT:    vzeroupper
4619; AVX-NEXT:    retq
4620;
4621; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24:
4622; AVX2:       # %bb.0:
4623; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4624; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4625; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
4626; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
4627; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
4628; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4629; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
4630; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4631; AVX2-NEXT:    vzeroupper
4632; AVX2-NEXT:    retq
4633;
4634; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24:
4635; AVX512F:       # %bb.0:
4636; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4637; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4638; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
4639; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
4640; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4641; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
4642; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
4643; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4644; AVX512F-NEXT:    vzeroupper
4645; AVX512F-NEXT:    retq
4646;
4647; AVX512BW-LABEL: vec384_v24i16_to_v1i384_factor24:
4648; AVX512BW:       # %bb.0:
4649; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
4650; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4651; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
4652; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4653; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4654; AVX512BW-NEXT:    vzeroupper
4655; AVX512BW-NEXT:    retq
4656  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4657  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4658  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4659  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4660  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4661  %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4662  %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4663  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4664  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4665  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4666  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4667  ret void
4668}
4669
4670define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4671; SSE2-LABEL: vec384_v12i32_to_v6i64_factor2:
4672; SSE2:       # %bb.0:
4673; SSE2-NEXT:    movdqa (%rdi), %xmm0
4674; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
4675; SSE2-NEXT:    paddb (%rsi), %xmm0
4676; SSE2-NEXT:    paddb 16(%rsi), %xmm1
4677; SSE2-NEXT:    pxor %xmm2, %xmm2
4678; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4679; SSE2-NEXT:    movdqa %xmm0, %xmm3
4680; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4681; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4682; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4683; SSE2-NEXT:    paddb (%rdx), %xmm3
4684; SSE2-NEXT:    paddb 32(%rdx), %xmm1
4685; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
4686; SSE2-NEXT:    movdqa %xmm3, (%rcx)
4687; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4688; SSE2-NEXT:    retq
4689;
4690; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2:
4691; SSE42:       # %bb.0:
4692; SSE42-NEXT:    movdqa (%rdi), %xmm0
4693; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
4694; SSE42-NEXT:    paddb (%rsi), %xmm0
4695; SSE42-NEXT:    paddb 16(%rsi), %xmm1
4696; SSE42-NEXT:    pxor %xmm2, %xmm2
4697; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
4698; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
4699; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4700; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4701; SSE42-NEXT:    paddb (%rdx), %xmm3
4702; SSE42-NEXT:    paddb 32(%rdx), %xmm1
4703; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
4704; SSE42-NEXT:    movdqa %xmm3, (%rcx)
4705; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4706; SSE42-NEXT:    retq
4707;
4708; AVX-LABEL: vec384_v12i32_to_v6i64_factor2:
4709; AVX:       # %bb.0:
4710; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4711; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
4712; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
4713; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4714; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
4715; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4716; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
4717; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
4718; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
4719; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
4720; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
4721; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
4722; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
4723; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
4724; AVX-NEXT:    retq
4725;
4726; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
4727; AVX2:       # %bb.0:
4728; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4729; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4730; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4731; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
4732; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4733; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4734; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4735; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
4736; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
4737; AVX2-NEXT:    vzeroupper
4738; AVX2-NEXT:    retq
4739;
4740; AVX512F-LABEL: vec384_v12i32_to_v6i64_factor2:
4741; AVX512F:       # %bb.0:
4742; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4743; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4744; AVX512F-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4745; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4746; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4747; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4748; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4749; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
4750; AVX512F-NEXT:    vzeroupper
4751; AVX512F-NEXT:    retq
4752;
4753; AVX512BW-LABEL: vec384_v12i32_to_v6i64_factor2:
4754; AVX512BW:       # %bb.0:
4755; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
4756; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4757; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4758; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4759; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4760; AVX512BW-NEXT:    vzeroupper
4761; AVX512BW-NEXT:    retq
4762  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4763  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4764  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4765  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4766  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
4767  %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 1, i32 15, i32 2, i32 17, i32 3, i32 19, i32 4, i32 21, i32 5, i32 23>
4768  %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
4769  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4770  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4771  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4772  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4773  ret void
4774}
4775
4776define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4777; SSE2-LABEL: vec384_v12i32_to_v4i96_factor3:
4778; SSE2:       # %bb.0:
4779; SSE2-NEXT:    movdqa (%rdi), %xmm0
4780; SSE2-NEXT:    paddb (%rsi), %xmm0
4781; SSE2-NEXT:    xorps %xmm1, %xmm1
4782; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,4294967295,0]
4783; SSE2-NEXT:    pand %xmm0, %xmm2
4784; SSE2-NEXT:    movdqa %xmm0, %xmm3
4785; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4786; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
4787; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4788; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
4789; SSE2-NEXT:    paddb (%rdx), %xmm0
4790; SSE2-NEXT:    paddb 32(%rdx), %xmm3
4791; SSE2-NEXT:    paddb 16(%rdx), %xmm2
4792; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
4793; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
4794; SSE2-NEXT:    movdqa %xmm0, (%rcx)
4795; SSE2-NEXT:    retq
4796;
4797; SSE42-LABEL: vec384_v12i32_to_v4i96_factor3:
4798; SSE42:       # %bb.0:
4799; SSE42-NEXT:    movdqa (%rdi), %xmm0
4800; SSE42-NEXT:    paddb (%rsi), %xmm0
4801; SSE42-NEXT:    pxor %xmm1, %xmm1
4802; SSE42-NEXT:    pxor %xmm2, %xmm2
4803; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
4804; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
4805; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
4806; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4807; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
4808; SSE42-NEXT:    paddb (%rdx), %xmm0
4809; SSE42-NEXT:    paddb 32(%rdx), %xmm3
4810; SSE42-NEXT:    paddb 16(%rdx), %xmm2
4811; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
4812; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
4813; SSE42-NEXT:    movdqa %xmm0, (%rcx)
4814; SSE42-NEXT:    retq
4815;
4816; AVX-LABEL: vec384_v12i32_to_v4i96_factor3:
4817; AVX:       # %bb.0:
4818; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4819; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4820; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
4821; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4822; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
4823; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4824; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4825; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
4826; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4827; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
4828; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
4829; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4830; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
4831; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
4832; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4833; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
4834; AVX-NEXT:    vzeroupper
4835; AVX-NEXT:    retq
4836;
4837; AVX2-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3:
4838; AVX2-SLOW:       # %bb.0:
4839; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
4840; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4841; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4842; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
4843; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4844; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4845; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4846; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4847; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4848; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4849; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
4850; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
4851; AVX2-SLOW-NEXT:    vzeroupper
4852; AVX2-SLOW-NEXT:    retq
4853;
4854; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v4i96_factor3:
4855; AVX2-FAST-PERLANE:       # %bb.0:
4856; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
4857; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4858; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4859; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
4860; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4861; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4862; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4863; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4864; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
4865; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
4866; AVX2-FAST-PERLANE-NEXT:    vzeroupper
4867; AVX2-FAST-PERLANE-NEXT:    retq
4868;
4869; AVX2-FAST-LABEL: vec384_v12i32_to_v4i96_factor3:
4870; AVX2-FAST:       # %bb.0:
4871; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
4872; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4873; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
4874; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
4875; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4876; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4877; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4878; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4879; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
4880; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
4881; AVX2-FAST-NEXT:    vzeroupper
4882; AVX2-FAST-NEXT:    retq
4883;
4884; AVX512F-LABEL: vec384_v12i32_to_v4i96_factor3:
4885; AVX512F:       # %bb.0:
4886; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4887; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4888; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,0,0,0,0]
4889; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4890; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
4891; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
4892; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4893; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
4894; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
4895; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
4896; AVX512F-NEXT:    vzeroupper
4897; AVX512F-NEXT:    retq
4898;
4899; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3:
4900; AVX512BW-SLOW:       # %bb.0:
4901; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
4902; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4903; AVX512BW-SLOW-NEXT:    movb $73, %al
4904; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
4905; AVX512BW-SLOW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
4906; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4907; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4908; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4909; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4910; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4911; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4912; AVX512BW-SLOW-NEXT:    vzeroupper
4913; AVX512BW-SLOW-NEXT:    retq
4914;
4915; AVX512BW-FAST-LABEL: vec384_v12i32_to_v4i96_factor3:
4916; AVX512BW-FAST:       # %bb.0:
4917; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
4918; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4919; AVX512BW-FAST-NEXT:    movb $73, %al
4920; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
4921; AVX512BW-FAST-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
4922; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4923; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4924; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4925; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4926; AVX512BW-FAST-NEXT:    vzeroupper
4927; AVX512BW-FAST-NEXT:    retq
4928  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4929  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4930  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4931  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4932  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
4933  %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 1, i32 16, i32 17, i32 2, i32 19, i32 20, i32 3, i32 22, i32 23>
4934  %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
4935  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4936  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4937  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4938  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4939  ret void
4940}
4941
4942define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4943; SSE2-LABEL: vec384_v12i32_to_v3i128_factor4:
4944; SSE2:       # %bb.0:
4945; SSE2-NEXT:    movdqa (%rdi), %xmm0
4946; SSE2-NEXT:    paddb (%rsi), %xmm0
4947; SSE2-NEXT:    xorps %xmm1, %xmm1
4948; SSE2-NEXT:    xorps %xmm2, %xmm2
4949; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
4950; SSE2-NEXT:    movdqa %xmm0, %xmm3
4951; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4952; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
4953; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
4954; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
4955; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4956; SSE2-NEXT:    paddb 32(%rdx), %xmm3
4957; SSE2-NEXT:    paddb (%rdx), %xmm2
4958; SSE2-NEXT:    movdqa %xmm2, (%rcx)
4959; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
4960; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4961; SSE2-NEXT:    retq
4962;
4963; SSE42-LABEL: vec384_v12i32_to_v3i128_factor4:
4964; SSE42:       # %bb.0:
4965; SSE42-NEXT:    movdqa (%rdi), %xmm0
4966; SSE42-NEXT:    paddb (%rsi), %xmm0
4967; SSE42-NEXT:    pxor %xmm1, %xmm1
4968; SSE42-NEXT:    pxor %xmm2, %xmm2
4969; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
4970; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
4971; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5,6,7]
4972; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4973; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4974; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4975; SSE42-NEXT:    paddb 32(%rdx), %xmm3
4976; SSE42-NEXT:    paddb (%rdx), %xmm2
4977; SSE42-NEXT:    movdqa %xmm2, (%rcx)
4978; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
4979; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4980; SSE42-NEXT:    retq
4981;
4982; AVX-LABEL: vec384_v12i32_to_v3i128_factor4:
4983; AVX:       # %bb.0:
4984; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4985; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4986; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
4987; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4988; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
4989; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
4990; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4991; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
4992; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
4993; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
4994; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
4995; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4996; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
4997; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
4998; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4999; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
5000; AVX-NEXT:    vzeroupper
5001; AVX-NEXT:    retq
5002;
5003; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
5004; AVX2-SLOW:       # %bb.0:
5005; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5006; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
5007; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
5008; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
5009; AVX2-SLOW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
5010; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
5011; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5012; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5013; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
5014; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5015; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
5016; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
5017; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
5018; AVX2-SLOW-NEXT:    vzeroupper
5019; AVX2-SLOW-NEXT:    retq
5020;
5021; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4:
5022; AVX2-FAST-PERLANE:       # %bb.0:
5023; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5024; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
5025; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
5026; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5027; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5028; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5029; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
5030; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5031; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
5032; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
5033; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
5034; AVX2-FAST-PERLANE-NEXT:    vzeroupper
5035; AVX2-FAST-PERLANE-NEXT:    retq
5036;
5037; AVX2-FAST-LABEL: vec384_v12i32_to_v3i128_factor4:
5038; AVX2-FAST:       # %bb.0:
5039; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
5040; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5041; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5042; AVX2-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0]
5043; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm2
5044; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
5045; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5046; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5047; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
5048; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
5049; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
5050; AVX2-FAST-NEXT:    vzeroupper
5051; AVX2-FAST-NEXT:    retq
5052;
5053; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4:
5054; AVX512F:       # %bb.0:
5055; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5056; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5057; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,0,0,0,0]
5058; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5059; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
5060; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
5061; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5062; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
5063; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
5064; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
5065; AVX512F-NEXT:    vzeroupper
5066; AVX512F-NEXT:    retq
5067;
5068; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
5069; AVX512BW-SLOW:       # %bb.0:
5070; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
5071; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5072; AVX512BW-SLOW-NEXT:    movb $17, %al
5073; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
5074; AVX512BW-SLOW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
5075; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
5076; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5077; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5078; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5079; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5080; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5081; AVX512BW-SLOW-NEXT:    vzeroupper
5082; AVX512BW-SLOW-NEXT:    retq
5083;
5084; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4:
5085; AVX512BW-FAST:       # %bb.0:
5086; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
5087; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5088; AVX512BW-FAST-NEXT:    movb $17, %al
5089; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
5090; AVX512BW-FAST-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
5091; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5092; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5093; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5094; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
5095; AVX512BW-FAST-NEXT:    vzeroupper
5096; AVX512BW-FAST-NEXT:    retq
5097  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5098  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5099  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5100  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5101  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
5102  %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 1, i32 17, i32 18, i32 19, i32 2, i32 21, i32 22, i32 23>
5103  %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
5104  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5105  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5106  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5107  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5108  ret void
5109}
5110
5111define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5112; SSE2-LABEL: vec384_v12i32_to_v2i192_factor6:
5113; SSE2:       # %bb.0:
5114; SSE2-NEXT:    movdqa (%rdi), %xmm0
5115; SSE2-NEXT:    paddb (%rsi), %xmm0
5116; SSE2-NEXT:    xorps %xmm1, %xmm1
5117; SSE2-NEXT:    xorps %xmm2, %xmm2
5118; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
5119; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
5120; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
5121; SSE2-NEXT:    movaps 32(%rdx), %xmm0
5122; SSE2-NEXT:    paddb 16(%rdx), %xmm1
5123; SSE2-NEXT:    paddb (%rdx), %xmm2
5124; SSE2-NEXT:    movaps %xmm0, 32(%rcx)
5125; SSE2-NEXT:    movdqa %xmm2, (%rcx)
5126; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
5127; SSE2-NEXT:    retq
5128;
5129; SSE42-LABEL: vec384_v12i32_to_v2i192_factor6:
5130; SSE42:       # %bb.0:
5131; SSE42-NEXT:    movdqa (%rdi), %xmm0
5132; SSE42-NEXT:    paddb (%rsi), %xmm0
5133; SSE42-NEXT:    pxor %xmm1, %xmm1
5134; SSE42-NEXT:    pxor %xmm2, %xmm2
5135; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5136; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
5137; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
5138; SSE42-NEXT:    movaps 32(%rdx), %xmm1
5139; SSE42-NEXT:    paddb 16(%rdx), %xmm0
5140; SSE42-NEXT:    paddb (%rdx), %xmm2
5141; SSE42-NEXT:    movaps %xmm1, 32(%rcx)
5142; SSE42-NEXT:    movdqa %xmm2, (%rcx)
5143; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
5144; SSE42-NEXT:    retq
5145;
5146; AVX-LABEL: vec384_v12i32_to_v2i192_factor6:
5147; AVX:       # %bb.0:
5148; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5149; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5150; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
5151; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
5152; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
5153; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
5154; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
5155; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
5156; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
5157; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5158; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5159; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5160; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
5161; AVX-NEXT:    vzeroupper
5162; AVX-NEXT:    retq
5163;
5164; AVX2-SLOW-LABEL: vec384_v12i32_to_v2i192_factor6:
5165; AVX2-SLOW:       # %bb.0:
5166; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5167; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
5168; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
5169; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5170; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
5171; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
5172; AVX2-SLOW-NEXT:    vmovaps 32(%rdx), %ymm1
5173; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5174; AVX2-SLOW-NEXT:    vmovaps %ymm1, 32(%rcx)
5175; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
5176; AVX2-SLOW-NEXT:    vzeroupper
5177; AVX2-SLOW-NEXT:    retq
5178;
5179; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v2i192_factor6:
5180; AVX2-FAST-PERLANE:       # %bb.0:
5181; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5182; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
5183; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
5184; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5185; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
5186; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
5187; AVX2-FAST-PERLANE-NEXT:    vmovaps 32(%rdx), %ymm1
5188; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5189; AVX2-FAST-PERLANE-NEXT:    vmovaps %ymm1, 32(%rcx)
5190; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
5191; AVX2-FAST-PERLANE-NEXT:    vzeroupper
5192; AVX2-FAST-PERLANE-NEXT:    retq
5193;
5194; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6:
5195; AVX2-FAST:       # %bb.0:
5196; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
5197; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5198; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5199; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
5200; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
5201; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
5202; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
5203; AVX2-FAST-NEXT:    vmovaps 32(%rdx), %ymm1
5204; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5205; AVX2-FAST-NEXT:    vmovaps %ymm1, 32(%rcx)
5206; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
5207; AVX2-FAST-NEXT:    vzeroupper
5208; AVX2-FAST-NEXT:    retq
5209;
5210; AVX512F-LABEL: vec384_v12i32_to_v2i192_factor6:
5211; AVX512F:       # %bb.0:
5212; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5213; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5214; AVX512F-NEXT:    movb $65, %al
5215; AVX512F-NEXT:    kmovw %eax, %k1
5216; AVX512F-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
5217; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5218; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
5219; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
5220; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5221; AVX512F-NEXT:    vzeroupper
5222; AVX512F-NEXT:    retq
5223;
5224; AVX512BW-LABEL: vec384_v12i32_to_v2i192_factor6:
5225; AVX512BW:       # %bb.0:
5226; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
5227; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5228; AVX512BW-NEXT:    movb $65, %al
5229; AVX512BW-NEXT:    kmovd %eax, %k1
5230; AVX512BW-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
5231; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5232; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5233; AVX512BW-NEXT:    vzeroupper
5234; AVX512BW-NEXT:    retq
5235  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5236  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5237  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5238  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5239  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
5240  %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 1, i32 19, i32 20, i32 21, i32 22, i32 23>
5241  %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
5242  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5243  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5244  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5245  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5246  ret void
5247}
5248
5249define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5250; SSE2-LABEL: vec384_v12i32_to_v1i384_factor12:
5251; SSE2:       # %bb.0:
5252; SSE2-NEXT:    movdqa (%rdi), %xmm0
5253; SSE2-NEXT:    paddb (%rsi), %xmm0
5254; SSE2-NEXT:    xorps %xmm1, %xmm1
5255; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
5256; SSE2-NEXT:    movaps 16(%rdx), %xmm0
5257; SSE2-NEXT:    movaps 32(%rdx), %xmm2
5258; SSE2-NEXT:    paddb (%rdx), %xmm1
5259; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
5260; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
5261; SSE2-NEXT:    movdqa %xmm1, (%rcx)
5262; SSE2-NEXT:    retq
5263;
5264; SSE42-LABEL: vec384_v12i32_to_v1i384_factor12:
5265; SSE42:       # %bb.0:
5266; SSE42-NEXT:    movdqa (%rdi), %xmm0
5267; SSE42-NEXT:    paddb (%rsi), %xmm0
5268; SSE42-NEXT:    pxor %xmm1, %xmm1
5269; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5270; SSE42-NEXT:    movaps 16(%rdx), %xmm0
5271; SSE42-NEXT:    movaps 32(%rdx), %xmm2
5272; SSE42-NEXT:    paddb (%rdx), %xmm1
5273; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
5274; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
5275; SSE42-NEXT:    movdqa %xmm1, (%rcx)
5276; SSE42-NEXT:    retq
5277;
5278; AVX-LABEL: vec384_v12i32_to_v1i384_factor12:
5279; AVX:       # %bb.0:
5280; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5281; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5282; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
5283; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
5284; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5285; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5286; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
5287; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
5288; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5289; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5290; AVX-NEXT:    vzeroupper
5291; AVX-NEXT:    retq
5292;
5293; AVX2-LABEL: vec384_v12i32_to_v1i384_factor12:
5294; AVX2:       # %bb.0:
5295; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5296; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5297; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5298; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5299; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
5300; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5301; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
5302; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5303; AVX2-NEXT:    vzeroupper
5304; AVX2-NEXT:    retq
5305;
5306; AVX512F-LABEL: vec384_v12i32_to_v1i384_factor12:
5307; AVX512F:       # %bb.0:
5308; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5309; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5310; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5311; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5312; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5313; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
5314; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
5315; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5316; AVX512F-NEXT:    vzeroupper
5317; AVX512F-NEXT:    retq
5318;
5319; AVX512BW-LABEL: vec384_v12i32_to_v1i384_factor12:
5320; AVX512BW:       # %bb.0:
5321; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
5322; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
5323; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5324; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5325; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5326; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5327; AVX512BW-NEXT:    vzeroupper
5328; AVX512BW-NEXT:    retq
5329  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5330  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5331  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5332  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5333  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
5334  %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
5335  %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
5336  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5337  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5338  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5339  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5340  ret void
5341}
5342
5343define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5344; SSE-LABEL: vec384_v6i64_to_v3i128_factor2:
5345; SSE:       # %bb.0:
5346; SSE-NEXT:    movdqa (%rdi), %xmm0
5347; SSE-NEXT:    movdqa 16(%rdi), %xmm1
5348; SSE-NEXT:    paddb (%rsi), %xmm0
5349; SSE-NEXT:    paddb 16(%rsi), %xmm1
5350; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
5351; SSE-NEXT:    movq {{.*#+}} xmm2 = xmm0[0],zero
5352; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
5353; SSE-NEXT:    paddb 16(%rdx), %xmm0
5354; SSE-NEXT:    paddb (%rdx), %xmm2
5355; SSE-NEXT:    paddb 32(%rdx), %xmm1
5356; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
5357; SSE-NEXT:    movdqa %xmm2, (%rcx)
5358; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
5359; SSE-NEXT:    retq
5360;
5361; AVX-LABEL: vec384_v6i64_to_v3i128_factor2:
5362; AVX:       # %bb.0:
5363; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5364; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
5365; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
5366; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5367; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
5368; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5369; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
5370; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
5371; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
5372; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
5373; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5374; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
5375; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
5376; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5377; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
5378; AVX-NEXT:    vzeroupper
5379; AVX-NEXT:    retq
5380;
5381; AVX2-LABEL: vec384_v6i64_to_v3i128_factor2:
5382; AVX2:       # %bb.0:
5383; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5384; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5385; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5386; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
5387; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
5388; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
5389; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
5390; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5391; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
5392; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
5393; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
5394; AVX2-NEXT:    vzeroupper
5395; AVX2-NEXT:    retq
5396;
5397; AVX512F-LABEL: vec384_v6i64_to_v3i128_factor2:
5398; AVX512F:       # %bb.0:
5399; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5400; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5401; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5402; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,0,0]
5403; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
5404; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
5405; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5406; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
5407; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
5408; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
5409; AVX512F-NEXT:    vzeroupper
5410; AVX512F-NEXT:    retq
5411;
5412; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2:
5413; AVX512BW:       # %bb.0:
5414; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
5415; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5416; AVX512BW-NEXT:    movb $5, %al
5417; AVX512BW-NEXT:    kmovd %eax, %k1
5418; AVX512BW-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
5419; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
5420; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
5421; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5422; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5423; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5424; AVX512BW-NEXT:    vzeroupper
5425; AVX512BW-NEXT:    retq
5426  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5427  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5428  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5429  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5430  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64>
5431  %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 1, i32 9, i32 2, i32 11>
5432  %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8>
5433  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5434  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5435  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5436  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5437  ret void
5438}
5439
5440define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5441; SSE2-LABEL: vec384_v6i64_to_v2i192_factor3:
5442; SSE2:       # %bb.0:
5443; SSE2-NEXT:    movdqa (%rdi), %xmm0
5444; SSE2-NEXT:    paddb (%rsi), %xmm0
5445; SSE2-NEXT:    pxor %xmm1, %xmm1
5446; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5447; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
5448; SSE2-NEXT:    movaps 32(%rdx), %xmm2
5449; SSE2-NEXT:    paddb (%rdx), %xmm0
5450; SSE2-NEXT:    paddb 16(%rdx), %xmm1
5451; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
5452; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
5453; SSE2-NEXT:    movdqa %xmm0, (%rcx)
5454; SSE2-NEXT:    retq
5455;
5456; SSE42-LABEL: vec384_v6i64_to_v2i192_factor3:
5457; SSE42:       # %bb.0:
5458; SSE42-NEXT:    movdqa (%rdi), %xmm0
5459; SSE42-NEXT:    paddb (%rsi), %xmm0
5460; SSE42-NEXT:    pxor %xmm1, %xmm1
5461; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
5462; SSE42-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
5463; SSE42-NEXT:    movaps 32(%rdx), %xmm2
5464; SSE42-NEXT:    paddb (%rdx), %xmm0
5465; SSE42-NEXT:    paddb 16(%rdx), %xmm1
5466; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
5467; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
5468; SSE42-NEXT:    movdqa %xmm0, (%rcx)
5469; SSE42-NEXT:    retq
5470;
5471; AVX-LABEL: vec384_v6i64_to_v2i192_factor3:
5472; AVX:       # %bb.0:
5473; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5474; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5475; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
5476; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
5477; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
5478; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
5479; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
5480; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5481; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5482; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5483; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
5484; AVX-NEXT:    vzeroupper
5485; AVX-NEXT:    retq
5486;
5487; AVX2-LABEL: vec384_v6i64_to_v2i192_factor3:
5488; AVX2:       # %bb.0:
5489; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5490; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5491; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5492; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
5493; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
5494; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
5495; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5496; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
5497; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5498; AVX2-NEXT:    vzeroupper
5499; AVX2-NEXT:    retq
5500;
5501; AVX512F-LABEL: vec384_v6i64_to_v2i192_factor3:
5502; AVX512F:       # %bb.0:
5503; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5504; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5505; AVX512F-NEXT:    movb $9, %al
5506; AVX512F-NEXT:    kmovw %eax, %k1
5507; AVX512F-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
5508; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5509; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
5510; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
5511; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5512; AVX512F-NEXT:    vzeroupper
5513; AVX512F-NEXT:    retq
5514;
5515; AVX512BW-LABEL: vec384_v6i64_to_v2i192_factor3:
5516; AVX512BW:       # %bb.0:
5517; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
5518; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5519; AVX512BW-NEXT:    movb $9, %al
5520; AVX512BW-NEXT:    kmovd %eax, %k1
5521; AVX512BW-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
5522; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5523; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5524; AVX512BW-NEXT:    vzeroupper
5525; AVX512BW-NEXT:    retq
5526  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5527  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5528  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5529  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5530  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64>
5531  %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 1, i32 10, i32 11>
5532  %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8>
5533  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5534  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5535  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5536  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5537  ret void
5538}
5539
5540define void @vec384_v6i64_to_v1i384_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5541; SSE-LABEL: vec384_v6i64_to_v1i384_factor6:
5542; SSE:       # %bb.0:
5543; SSE-NEXT:    movdqa (%rdi), %xmm0
5544; SSE-NEXT:    paddb (%rsi), %xmm0
5545; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
5546; SSE-NEXT:    movaps 16(%rdx), %xmm1
5547; SSE-NEXT:    movaps 32(%rdx), %xmm2
5548; SSE-NEXT:    paddb (%rdx), %xmm0
5549; SSE-NEXT:    movaps %xmm1, 16(%rcx)
5550; SSE-NEXT:    movaps %xmm2, 32(%rcx)
5551; SSE-NEXT:    movdqa %xmm0, (%rcx)
5552; SSE-NEXT:    retq
5553;
5554; AVX-LABEL: vec384_v6i64_to_v1i384_factor6:
5555; AVX:       # %bb.0:
5556; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5557; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5558; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
5559; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
5560; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5561; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
5562; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
5563; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5564; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5565; AVX-NEXT:    vzeroupper
5566; AVX-NEXT:    retq
5567;
5568; AVX2-LABEL: vec384_v6i64_to_v1i384_factor6:
5569; AVX2:       # %bb.0:
5570; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5571; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5572; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
5573; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
5574; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5575; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
5576; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5577; AVX2-NEXT:    vzeroupper
5578; AVX2-NEXT:    retq
5579;
5580; AVX512F-LABEL: vec384_v6i64_to_v1i384_factor6:
5581; AVX512F:       # %bb.0:
5582; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5583; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5584; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
5585; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5586; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
5587; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
5588; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5589; AVX512F-NEXT:    vzeroupper
5590; AVX512F-NEXT:    retq
5591;
5592; AVX512BW-LABEL: vec384_v6i64_to_v1i384_factor6:
5593; AVX512BW:       # %bb.0:
5594; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
5595; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
5596; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
5597; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5598; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5599; AVX512BW-NEXT:    vzeroupper
5600; AVX512BW-NEXT:    retq
5601  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5602  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5603  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5604  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5605  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64>
5606  %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 9, i32 10, i32 11>
5607  %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8>
5608  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5609  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5610  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5611  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5612  ret void
5613}
5614
5615define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5616; SSE-LABEL: vec384_v3i128_to_v1i384_factor3:
5617; SSE:       # %bb.0:
5618; SSE-NEXT:    movdqa (%rdi), %xmm0
5619; SSE-NEXT:    paddb (%rsi), %xmm0
5620; SSE-NEXT:    movaps 16(%rdx), %xmm1
5621; SSE-NEXT:    movaps 32(%rdx), %xmm2
5622; SSE-NEXT:    paddb (%rdx), %xmm0
5623; SSE-NEXT:    movaps %xmm1, 16(%rcx)
5624; SSE-NEXT:    movaps %xmm2, 32(%rcx)
5625; SSE-NEXT:    movdqa %xmm0, (%rcx)
5626; SSE-NEXT:    retq
5627;
5628; AVX-LABEL: vec384_v3i128_to_v1i384_factor3:
5629; AVX:       # %bb.0:
5630; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5631; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5632; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
5633; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5634; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
5635; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
5636; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5637; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5638; AVX-NEXT:    vzeroupper
5639; AVX-NEXT:    retq
5640;
5641; AVX2-LABEL: vec384_v3i128_to_v1i384_factor3:
5642; AVX2:       # %bb.0:
5643; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5644; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5645; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
5646; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5647; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
5648; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5649; AVX2-NEXT:    vzeroupper
5650; AVX2-NEXT:    retq
5651;
5652; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3:
5653; AVX512F:       # %bb.0:
5654; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5655; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5656; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5657; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
5658; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
5659; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5660; AVX512F-NEXT:    vzeroupper
5661; AVX512F-NEXT:    retq
5662;
5663; AVX512BW-LABEL: vec384_v3i128_to_v1i384_factor3:
5664; AVX512BW:       # %bb.0:
5665; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5666; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5667; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5668; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5669; AVX512BW-NEXT:    vzeroupper
5670; AVX512BW-NEXT:    retq
5671  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5672  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5673  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5674  %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5675  %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <3 x i128>
5676  %zextd.vec = shufflevector <3 x i128> %in.vec.cast, <3 x i128> zeroinitializer, <3 x i32> <i32 0, i32 4, i32 5>
5677  %out.bytevec = bitcast <3 x i128> %zextd.vec to <48 x i8>
5678  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5679  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5680  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5681  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5682  ret void
5683}
5684
5685define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5686; SSE2-LABEL: vec512_v64i8_to_v32i16_factor2:
5687; SSE2:       # %bb.0:
5688; SSE2-NEXT:    movdqa (%rdi), %xmm0
5689; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
5690; SSE2-NEXT:    paddb (%rsi), %xmm0
5691; SSE2-NEXT:    paddb 16(%rsi), %xmm1
5692; SSE2-NEXT:    pxor %xmm2, %xmm2
5693; SSE2-NEXT:    movdqa %xmm1, %xmm3
5694; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
5695; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
5696; SSE2-NEXT:    movdqa %xmm0, %xmm4
5697; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
5698; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
5699; SSE2-NEXT:    paddb 16(%rdx), %xmm0
5700; SSE2-NEXT:    paddb (%rdx), %xmm4
5701; SSE2-NEXT:    paddb 48(%rdx), %xmm1
5702; SSE2-NEXT:    paddb 32(%rdx), %xmm3
5703; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
5704; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
5705; SSE2-NEXT:    movdqa %xmm4, (%rcx)
5706; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
5707; SSE2-NEXT:    retq
5708;
5709; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2:
5710; SSE42:       # %bb.0:
5711; SSE42-NEXT:    movdqa (%rdi), %xmm0
5712; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
5713; SSE42-NEXT:    paddb (%rsi), %xmm0
5714; SSE42-NEXT:    paddb 16(%rsi), %xmm1
5715; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
5716; SSE42-NEXT:    pxor %xmm3, %xmm3
5717; SSE42-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
5718; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
5719; SSE42-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
5720; SSE42-NEXT:    paddb 16(%rdx), %xmm0
5721; SSE42-NEXT:    paddb (%rdx), %xmm4
5722; SSE42-NEXT:    paddb 48(%rdx), %xmm1
5723; SSE42-NEXT:    paddb 32(%rdx), %xmm2
5724; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
5725; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
5726; SSE42-NEXT:    movdqa %xmm4, (%rcx)
5727; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
5728; SSE42-NEXT:    retq
5729;
5730; AVX-LABEL: vec512_v64i8_to_v32i16_factor2:
5731; AVX:       # %bb.0:
5732; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5733; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
5734; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
5735; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5736; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
5737; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
5738; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
5739; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
5740; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
5741; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
5742; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
5743; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
5744; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
5745; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
5746; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
5747; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
5748; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
5749; AVX-NEXT:    retq
5750;
5751; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2:
5752; AVX2:       # %bb.0:
5753; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5754; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5755; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5756; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
5757; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5758; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5759; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
5760; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
5761; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
5762; AVX2-NEXT:    vzeroupper
5763; AVX2-NEXT:    retq
5764;
5765; AVX512F-LABEL: vec512_v64i8_to_v32i16_factor2:
5766; AVX512F:       # %bb.0:
5767; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5768; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5769; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5770; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
5771; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5772; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5773; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
5774; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
5775; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
5776; AVX512F-NEXT:    vzeroupper
5777; AVX512F-NEXT:    retq
5778;
5779; AVX512BW-LABEL: vec512_v64i8_to_v32i16_factor2:
5780; AVX512BW:       # %bb.0:
5781; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
5782; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5783; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
5784; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5785; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5786; AVX512BW-NEXT:    vzeroupper
5787; AVX512BW-NEXT:    retq
5788  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5789  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5790  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5791  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 1, i32 67, i32 2, i32 69, i32 3, i32 71, i32 4, i32 73, i32 5, i32 75, i32 6, i32 77, i32 7, i32 79, i32 8, i32 81, i32 9, i32 83, i32 10, i32 85, i32 11, i32 87, i32 12, i32 89, i32 13, i32 91, i32 14, i32 93, i32 15, i32 95, i32 16, i32 97, i32 17, i32 99, i32 18, i32 101, i32 19, i32 103, i32 20, i32 105, i32 21, i32 107, i32 22, i32 109, i32 23, i32 111, i32 24, i32 113, i32 25, i32 115, i32 26, i32 117, i32 27, i32 119, i32 28, i32 121, i32 29, i32 123, i32 30, i32 125, i32 31, i32 127>
5792  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5793  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
5794  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5795  ret void
5796}
5797
5798define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5799; SSE2-LABEL: vec512_v64i8_to_v16i32_factor4:
5800; SSE2:       # %bb.0:
5801; SSE2-NEXT:    movdqa (%rdi), %xmm0
5802; SSE2-NEXT:    paddb (%rsi), %xmm0
5803; SSE2-NEXT:    pxor %xmm1, %xmm1
5804; SSE2-NEXT:    movdqa %xmm0, %xmm2
5805; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
5806; SSE2-NEXT:    movdqa %xmm2, %xmm3
5807; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
5808; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5809; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5810; SSE2-NEXT:    movdqa %xmm0, %xmm4
5811; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5812; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5813; SSE2-NEXT:    paddb 16(%rdx), %xmm0
5814; SSE2-NEXT:    paddb (%rdx), %xmm4
5815; SSE2-NEXT:    paddb 48(%rdx), %xmm2
5816; SSE2-NEXT:    paddb 32(%rdx), %xmm3
5817; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
5818; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
5819; SSE2-NEXT:    movdqa %xmm4, (%rcx)
5820; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
5821; SSE2-NEXT:    retq
5822;
5823; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4:
5824; SSE42:       # %bb.0:
5825; SSE42-NEXT:    movdqa (%rdi), %xmm0
5826; SSE42-NEXT:    paddb (%rsi), %xmm0
5827; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5828; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
5829; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
5830; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
5831; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
5832; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
5833; SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5834; SSE42-NEXT:    paddb 16(%rdx), %xmm0
5835; SSE42-NEXT:    paddb 48(%rdx), %xmm3
5836; SSE42-NEXT:    paddb 32(%rdx), %xmm2
5837; SSE42-NEXT:    paddb (%rdx), %xmm1
5838; SSE42-NEXT:    movdqa %xmm1, (%rcx)
5839; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
5840; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
5841; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
5842; SSE42-NEXT:    retq
5843;
5844; AVX-LABEL: vec512_v64i8_to_v16i32_factor4:
5845; AVX:       # %bb.0:
5846; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5847; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5848; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5849; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
5850; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
5851; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
5852; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
5853; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5854; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5855; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
5856; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
5857; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
5858; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
5859; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
5860; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
5861; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
5862; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
5863; AVX-NEXT:    retq
5864;
5865; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4:
5866; AVX2:       # %bb.0:
5867; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5868; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5869; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
5870; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
5871; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
5872; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5873; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
5874; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
5875; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
5876; AVX2-NEXT:    vzeroupper
5877; AVX2-NEXT:    retq
5878;
5879; AVX512F-LABEL: vec512_v64i8_to_v16i32_factor4:
5880; AVX512F:       # %bb.0:
5881; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5882; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5883; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
5884; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
5885; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
5886; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5887; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5888; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5889; AVX512F-NEXT:    vzeroupper
5890; AVX512F-NEXT:    retq
5891;
5892; AVX512BW-LABEL: vec512_v64i8_to_v16i32_factor4:
5893; AVX512BW:       # %bb.0:
5894; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5895; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5896; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
5897; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5898; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5899; AVX512BW-NEXT:    vzeroupper
5900; AVX512BW-NEXT:    retq
5901  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5902  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5903  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5904  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 1, i32 69, i32 70, i32 71, i32 2, i32 73, i32 74, i32 75, i32 3, i32 77, i32 78, i32 79, i32 4, i32 81, i32 82, i32 83, i32 5, i32 85, i32 86, i32 87, i32 6, i32 89, i32 90, i32 91, i32 7, i32 93, i32 94, i32 95, i32 8, i32 97, i32 98, i32 99, i32 9, i32 101, i32 102, i32 103, i32 10, i32 105, i32 106, i32 107, i32 11, i32 109, i32 110, i32 111, i32 12, i32 113, i32 114, i32 115, i32 13, i32 117, i32 118, i32 119, i32 14, i32 121, i32 122, i32 123, i32 15, i32 125, i32 126, i32 127>
5905  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5906  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
5907  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5908  ret void
5909}
5910
5911define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5912; SSE2-LABEL: vec512_v64i8_to_v8i64_factor8:
5913; SSE2:       # %bb.0:
5914; SSE2-NEXT:    movdqa (%rdi), %xmm0
5915; SSE2-NEXT:    paddb (%rsi), %xmm0
5916; SSE2-NEXT:    pxor %xmm1, %xmm1
5917; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5918; SSE2-NEXT:    movdqa %xmm0, %xmm2
5919; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5920; SSE2-NEXT:    movdqa %xmm2, %xmm3
5921; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5922; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5923; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5924; SSE2-NEXT:    movdqa %xmm0, %xmm4
5925; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5926; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5927; SSE2-NEXT:    paddb 16(%rdx), %xmm0
5928; SSE2-NEXT:    paddb (%rdx), %xmm4
5929; SSE2-NEXT:    paddb 48(%rdx), %xmm2
5930; SSE2-NEXT:    paddb 32(%rdx), %xmm3
5931; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
5932; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
5933; SSE2-NEXT:    movdqa %xmm4, (%rcx)
5934; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
5935; SSE2-NEXT:    retq
5936;
5937; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8:
5938; SSE42:       # %bb.0:
5939; SSE42-NEXT:    movdqa (%rdi), %xmm0
5940; SSE42-NEXT:    paddb (%rsi), %xmm0
5941; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5942; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
5943; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
5944; SSE42-NEXT:    movdqa %xmm0, %xmm3
5945; SSE42-NEXT:    psrlq $48, %xmm3
5946; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
5947; SSE42-NEXT:    psrld $16, %xmm0
5948; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5949; SSE42-NEXT:    paddb 16(%rdx), %xmm0
5950; SSE42-NEXT:    paddb 48(%rdx), %xmm3
5951; SSE42-NEXT:    paddb 32(%rdx), %xmm2
5952; SSE42-NEXT:    paddb (%rdx), %xmm1
5953; SSE42-NEXT:    movdqa %xmm1, (%rcx)
5954; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
5955; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
5956; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
5957; SSE42-NEXT:    retq
5958;
5959; AVX-LABEL: vec512_v64i8_to_v8i64_factor8:
5960; AVX:       # %bb.0:
5961; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5962; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5963; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5964; AVX-NEXT:    vpsrld $16, %xmm0, %xmm2
5965; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
5966; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
5967; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
5968; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
5969; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5970; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
5971; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
5972; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
5973; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
5974; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
5975; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
5976; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
5977; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
5978; AVX-NEXT:    retq
5979;
5980; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8:
5981; AVX2:       # %bb.0:
5982; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5983; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5984; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
5985; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
5986; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
5987; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
5988; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
5989; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
5990; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
5991; AVX2-NEXT:    vzeroupper
5992; AVX2-NEXT:    retq
5993;
5994; AVX512F-LABEL: vec512_v64i8_to_v8i64_factor8:
5995; AVX512F:       # %bb.0:
5996; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5997; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5998; AVX512F-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
5999; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6000; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
6001; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6002; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
6003; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
6004; AVX512F-NEXT:    vzeroupper
6005; AVX512F-NEXT:    retq
6006;
6007; AVX512BW-LABEL: vec512_v64i8_to_v8i64_factor8:
6008; AVX512BW:       # %bb.0:
6009; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
6010; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6011; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
6012; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6013; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6014; AVX512BW-NEXT:    vzeroupper
6015; AVX512BW-NEXT:    retq
6016  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6017  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6018  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6019  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 1, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 3, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 4, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 5, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 6, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 7, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6020  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6021  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6022  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6023  ret void
6024}
6025
6026define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6027; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16:
6028; SSE2:       # %bb.0:
6029; SSE2-NEXT:    movdqa (%rdi), %xmm0
6030; SSE2-NEXT:    paddb (%rsi), %xmm0
6031; SSE2-NEXT:    movd {{.*#+}} xmm1 = [255,0,0,0]
6032; SSE2-NEXT:    pand %xmm0, %xmm1
6033; SSE2-NEXT:    movdqa %xmm0, %xmm2
6034; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
6035; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6036; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6037; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6038; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6039; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6040; SSE2-NEXT:    paddb 16(%rdx), %xmm0
6041; SSE2-NEXT:    paddb 48(%rdx), %xmm3
6042; SSE2-NEXT:    paddb 32(%rdx), %xmm2
6043; SSE2-NEXT:    paddb (%rdx), %xmm1
6044; SSE2-NEXT:    movdqa %xmm1, (%rcx)
6045; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
6046; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
6047; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
6048; SSE2-NEXT:    retq
6049;
6050; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16:
6051; SSE42:       # %bb.0:
6052; SSE42-NEXT:    movdqa (%rdi), %xmm0
6053; SSE42-NEXT:    paddb (%rsi), %xmm0
6054; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = [255,0]
6055; SSE42-NEXT:    pand %xmm0, %xmm1
6056; SSE42-NEXT:    movdqa %xmm0, %xmm2
6057; SSE42-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
6058; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6059; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6060; SSE42-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6061; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6062; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6063; SSE42-NEXT:    paddb 16(%rdx), %xmm0
6064; SSE42-NEXT:    paddb 48(%rdx), %xmm3
6065; SSE42-NEXT:    paddb 32(%rdx), %xmm2
6066; SSE42-NEXT:    paddb (%rdx), %xmm1
6067; SSE42-NEXT:    movdqa %xmm1, (%rcx)
6068; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
6069; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
6070; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
6071; SSE42-NEXT:    retq
6072;
6073; AVX-LABEL: vec512_v64i8_to_v4i128_factor16:
6074; AVX:       # %bb.0:
6075; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6076; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6077; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
6078; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6079; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6080; AVX-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
6081; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6082; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6083; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6084; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
6085; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
6086; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
6087; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
6088; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
6089; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
6090; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
6091; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
6092; AVX-NEXT:    retq
6093;
6094; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
6095; AVX2-SLOW:       # %bb.0:
6096; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
6097; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6098; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6099; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6100; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6101; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
6102; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm1, %ymm1
6103; AVX2-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
6104; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6105; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6106; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
6107; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6108; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6109; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
6110; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
6111; AVX2-SLOW-NEXT:    vzeroupper
6112; AVX2-SLOW-NEXT:    retq
6113;
6114; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16:
6115; AVX2-FAST-PERLANE:       # %bb.0:
6116; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
6117; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6118; AVX2-FAST-PERLANE-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6119; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6120; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6121; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
6122; AVX2-FAST-PERLANE-NEXT:    vpand %ymm2, %ymm1, %ymm1
6123; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6124; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6125; AVX2-FAST-PERLANE-NEXT:    vpand %ymm2, %ymm0, %ymm0
6126; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6127; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6128; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
6129; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
6130; AVX2-FAST-PERLANE-NEXT:    vzeroupper
6131; AVX2-FAST-PERLANE-NEXT:    retq
6132;
6133; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
6134; AVX2-FAST:       # %bb.0:
6135; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
6136; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6137; AVX2-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6138; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6139; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6140; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
6141; AVX2-FAST-NEXT:    vpand %ymm2, %ymm1, %ymm1
6142; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6143; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6144; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
6145; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6146; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6147; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
6148; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
6149; AVX2-FAST-NEXT:    vzeroupper
6150; AVX2-FAST-NEXT:    retq
6151;
6152; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
6153; AVX512F-SLOW:       # %bb.0:
6154; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
6155; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6156; AVX512F-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6157; AVX512F-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
6158; AVX512F-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6159; AVX512F-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
6160; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6161; AVX512F-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6162; AVX512F-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6163; AVX512F-SLOW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
6164; AVX512F-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6165; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
6166; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6167; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
6168; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
6169; AVX512F-SLOW-NEXT:    vzeroupper
6170; AVX512F-SLOW-NEXT:    retq
6171;
6172; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
6173; AVX512F-FAST:       # %bb.0:
6174; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
6175; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6176; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6177; AVX512F-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6178; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6179; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6180; AVX512F-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6181; AVX512F-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6182; AVX512F-FAST-NEXT:    vpandq %zmm1, %zmm0, %zmm0
6183; AVX512F-FAST-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6184; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
6185; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6186; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
6187; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
6188; AVX512F-FAST-NEXT:    vzeroupper
6189; AVX512F-FAST-NEXT:    retq
6190;
6191; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
6192; AVX512BW-SLOW:       # %bb.0:
6193; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
6194; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6195; AVX512BW-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6196; AVX512BW-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
6197; AVX512BW-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6198; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
6199; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6200; AVX512BW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6201; AVX512BW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6202; AVX512BW-SLOW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
6203; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6204; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6205; AVX512BW-SLOW-NEXT:    vzeroupper
6206; AVX512BW-SLOW-NEXT:    retq
6207;
6208; AVX512BW-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
6209; AVX512BW-FAST:       # %bb.0:
6210; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
6211; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6212; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6213; AVX512BW-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6214; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6215; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6216; AVX512BW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6217; AVX512BW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6218; AVX512BW-FAST-NEXT:    vpandq %zmm1, %zmm0, %zmm0
6219; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6220; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
6221; AVX512BW-FAST-NEXT:    vzeroupper
6222; AVX512BW-FAST-NEXT:    retq
6223  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6224  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6225  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6226  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 1, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 2, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 3, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6227  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6228  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6229  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6230  ret void
6231}
6232
6233define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6234; SSE2-LABEL: vec512_v64i8_to_v2i256_factor32:
6235; SSE2:       # %bb.0:
6236; SSE2-NEXT:    movdqa (%rdi), %xmm0
6237; SSE2-NEXT:    paddb (%rsi), %xmm0
6238; SSE2-NEXT:    movd {{.*#+}} xmm1 = [255,0,0,0]
6239; SSE2-NEXT:    pand %xmm0, %xmm1
6240; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6241; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6242; SSE2-NEXT:    movaps 16(%rdx), %xmm2
6243; SSE2-NEXT:    movaps 48(%rdx), %xmm3
6244; SSE2-NEXT:    paddb 32(%rdx), %xmm0
6245; SSE2-NEXT:    paddb (%rdx), %xmm1
6246; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
6247; SSE2-NEXT:    movaps %xmm2, 16(%rcx)
6248; SSE2-NEXT:    movdqa %xmm1, (%rcx)
6249; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
6250; SSE2-NEXT:    retq
6251;
6252; SSE42-LABEL: vec512_v64i8_to_v2i256_factor32:
6253; SSE42:       # %bb.0:
6254; SSE42-NEXT:    movdqa (%rdi), %xmm0
6255; SSE42-NEXT:    paddb (%rsi), %xmm0
6256; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = [255,0]
6257; SSE42-NEXT:    pand %xmm0, %xmm1
6258; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6259; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6260; SSE42-NEXT:    movaps 16(%rdx), %xmm2
6261; SSE42-NEXT:    movaps 48(%rdx), %xmm3
6262; SSE42-NEXT:    paddb 32(%rdx), %xmm0
6263; SSE42-NEXT:    paddb (%rdx), %xmm1
6264; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
6265; SSE42-NEXT:    movaps %xmm2, 16(%rcx)
6266; SSE42-NEXT:    movdqa %xmm1, (%rcx)
6267; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
6268; SSE42-NEXT:    retq
6269;
6270; AVX-LABEL: vec512_v64i8_to_v2i256_factor32:
6271; AVX:       # %bb.0:
6272; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6273; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6274; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6275; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6276; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
6277; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
6278; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
6279; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
6280; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
6281; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
6282; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
6283; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
6284; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
6285; AVX-NEXT:    retq
6286;
6287; AVX2-LABEL: vec512_v64i8_to_v2i256_factor32:
6288; AVX2:       # %bb.0:
6289; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
6290; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6291; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
6292; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm1
6293; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6294; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6295; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6296; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
6297; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
6298; AVX2-NEXT:    vzeroupper
6299; AVX2-NEXT:    retq
6300;
6301; AVX512F-LABEL: vec512_v64i8_to_v2i256_factor32:
6302; AVX512F:       # %bb.0:
6303; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
6304; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6305; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
6306; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm1
6307; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6308; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6309; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6310; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
6311; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
6312; AVX512F-NEXT:    vzeroupper
6313; AVX512F-NEXT:    retq
6314;
6315; AVX512BW-LABEL: vec512_v64i8_to_v2i256_factor32:
6316; AVX512BW:       # %bb.0:
6317; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
6318; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6319; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
6320; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm1
6321; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6322; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
6323; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6324; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6325; AVX512BW-NEXT:    vzeroupper
6326; AVX512BW-NEXT:    retq
6327  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6328  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6329  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6330  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 1, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6331  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6332  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6333  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6334  ret void
6335}
6336
6337define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6338; SSE-LABEL: vec512_v64i8_to_v1i512_factor64:
6339; SSE:       # %bb.0:
6340; SSE-NEXT:    movdqa (%rdi), %xmm0
6341; SSE-NEXT:    paddb (%rsi), %xmm0
6342; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6343; SSE-NEXT:    movaps 16(%rdx), %xmm1
6344; SSE-NEXT:    movaps 32(%rdx), %xmm2
6345; SSE-NEXT:    movaps 48(%rdx), %xmm3
6346; SSE-NEXT:    paddb (%rdx), %xmm0
6347; SSE-NEXT:    movaps %xmm2, 32(%rcx)
6348; SSE-NEXT:    movaps %xmm3, 48(%rcx)
6349; SSE-NEXT:    movaps %xmm1, 16(%rcx)
6350; SSE-NEXT:    movdqa %xmm0, (%rcx)
6351; SSE-NEXT:    retq
6352;
6353; AVX-LABEL: vec512_v64i8_to_v1i512_factor64:
6354; AVX:       # %bb.0:
6355; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6356; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6357; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
6358; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
6359; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
6360; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
6361; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
6362; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
6363; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
6364; AVX-NEXT:    vzeroupper
6365; AVX-NEXT:    retq
6366;
6367; AVX2-LABEL: vec512_v64i8_to_v1i512_factor64:
6368; AVX2:       # %bb.0:
6369; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
6370; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6371; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
6372; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
6373; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
6374; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6375; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
6376; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
6377; AVX2-NEXT:    vzeroupper
6378; AVX2-NEXT:    retq
6379;
6380; AVX512F-LABEL: vec512_v64i8_to_v1i512_factor64:
6381; AVX512F:       # %bb.0:
6382; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
6383; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6384; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
6385; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
6386; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6387; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
6388; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
6389; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
6390; AVX512F-NEXT:    vzeroupper
6391; AVX512F-NEXT:    retq
6392;
6393; AVX512BW-LABEL: vec512_v64i8_to_v1i512_factor64:
6394; AVX512BW:       # %bb.0:
6395; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
6396; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
6397; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
6398; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
6399; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6400; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6401; AVX512BW-NEXT:    vzeroupper
6402; AVX512BW-NEXT:    retq
6403  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6404  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6405  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6406  %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6407  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6408  %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6409  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6410  ret void
6411}
6412
6413define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6414; SSE2-LABEL: vec512_v32i16_to_v16i32_factor2:
6415; SSE2:       # %bb.0:
6416; SSE2-NEXT:    movdqa (%rdi), %xmm0
6417; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
6418; SSE2-NEXT:    paddb (%rsi), %xmm0
6419; SSE2-NEXT:    paddb 16(%rsi), %xmm1
6420; SSE2-NEXT:    pxor %xmm2, %xmm2
6421; SSE2-NEXT:    movdqa %xmm1, %xmm3
6422; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6423; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
6424; SSE2-NEXT:    movdqa %xmm0, %xmm4
6425; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6426; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6427; SSE2-NEXT:    paddb 16(%rdx), %xmm0
6428; SSE2-NEXT:    paddb (%rdx), %xmm4
6429; SSE2-NEXT:    paddb 48(%rdx), %xmm1
6430; SSE2-NEXT:    paddb 32(%rdx), %xmm3
6431; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
6432; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
6433; SSE2-NEXT:    movdqa %xmm4, (%rcx)
6434; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
6435; SSE2-NEXT:    retq
6436;
6437; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2:
6438; SSE42:       # %bb.0:
6439; SSE42-NEXT:    movdqa (%rdi), %xmm0
6440; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
6441; SSE42-NEXT:    paddb (%rsi), %xmm0
6442; SSE42-NEXT:    paddb 16(%rsi), %xmm1
6443; SSE42-NEXT:    pxor %xmm2, %xmm2
6444; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
6445; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
6446; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
6447; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6448; SSE42-NEXT:    paddb 16(%rdx), %xmm0
6449; SSE42-NEXT:    paddb (%rdx), %xmm4
6450; SSE42-NEXT:    paddb 48(%rdx), %xmm1
6451; SSE42-NEXT:    paddb 32(%rdx), %xmm3
6452; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
6453; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
6454; SSE42-NEXT:    movdqa %xmm4, (%rcx)
6455; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
6456; SSE42-NEXT:    retq
6457;
6458; AVX-LABEL: vec512_v32i16_to_v16i32_factor2:
6459; AVX:       # %bb.0:
6460; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6461; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
6462; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
6463; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6464; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
6465; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
6466; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
6467; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
6468; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
6469; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
6470; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
6471; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
6472; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
6473; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
6474; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
6475; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
6476; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
6477; AVX-NEXT:    retq
6478;
6479; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2:
6480; AVX2:       # %bb.0:
6481; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
6482; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6483; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
6484; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
6485; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
6486; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6487; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6488; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
6489; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
6490; AVX2-NEXT:    vzeroupper
6491; AVX2-NEXT:    retq
6492;
6493; AVX512F-LABEL: vec512_v32i16_to_v16i32_factor2:
6494; AVX512F:       # %bb.0:
6495; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
6496; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6497; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
6498; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6499; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
6500; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6501; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
6502; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
6503; AVX512F-NEXT:    vzeroupper
6504; AVX512F-NEXT:    retq
6505;
6506; AVX512BW-LABEL: vec512_v32i16_to_v16i32_factor2:
6507; AVX512BW:       # %bb.0:
6508; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
6509; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6510; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
6511; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6512; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6513; AVX512BW-NEXT:    vzeroupper
6514; AVX512BW-NEXT:    retq
6515  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6516  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6517  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6518  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6519  %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 1, i32 35, i32 2, i32 37, i32 3, i32 39, i32 4, i32 41, i32 5, i32 43, i32 6, i32 45, i32 7, i32 47, i32 8, i32 49, i32 9, i32 51, i32 10, i32 53, i32 11, i32 55, i32 12, i32 57, i32 13, i32 59, i32 14, i32 61, i32 15, i32 63>
6520  %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6521  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6522  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6523  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6524  ret void
6525}
6526
6527define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6528; SSE2-LABEL: vec512_v32i16_to_v8i64_factor4:
6529; SSE2:       # %bb.0:
6530; SSE2-NEXT:    movdqa (%rdi), %xmm0
6531; SSE2-NEXT:    paddb (%rsi), %xmm0
6532; SSE2-NEXT:    pxor %xmm1, %xmm1
6533; SSE2-NEXT:    movdqa %xmm0, %xmm2
6534; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6535; SSE2-NEXT:    movdqa %xmm2, %xmm3
6536; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
6537; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6538; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6539; SSE2-NEXT:    movdqa %xmm0, %xmm4
6540; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6541; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6542; SSE2-NEXT:    paddb 16(%rdx), %xmm0
6543; SSE2-NEXT:    paddb (%rdx), %xmm4
6544; SSE2-NEXT:    paddb 48(%rdx), %xmm2
6545; SSE2-NEXT:    paddb 32(%rdx), %xmm3
6546; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
6547; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
6548; SSE2-NEXT:    movdqa %xmm4, (%rcx)
6549; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
6550; SSE2-NEXT:    retq
6551;
6552; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4:
6553; SSE42:       # %bb.0:
6554; SSE42-NEXT:    movdqa (%rdi), %xmm0
6555; SSE42-NEXT:    paddb (%rsi), %xmm0
6556; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6557; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
6558; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6559; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
6560; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6561; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
6562; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6563; SSE42-NEXT:    paddb 16(%rdx), %xmm0
6564; SSE42-NEXT:    paddb 48(%rdx), %xmm3
6565; SSE42-NEXT:    paddb 32(%rdx), %xmm2
6566; SSE42-NEXT:    paddb (%rdx), %xmm1
6567; SSE42-NEXT:    movdqa %xmm1, (%rcx)
6568; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
6569; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
6570; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
6571; SSE42-NEXT:    retq
6572;
6573; AVX-LABEL: vec512_v32i16_to_v8i64_factor4:
6574; AVX:       # %bb.0:
6575; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6576; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6577; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6578; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
6579; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6580; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
6581; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6582; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
6583; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6584; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
6585; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
6586; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
6587; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
6588; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
6589; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
6590; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
6591; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
6592; AVX-NEXT:    retq
6593;
6594; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4:
6595; AVX2:       # %bb.0:
6596; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
6597; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6598; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
6599; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
6600; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
6601; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6602; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6603; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
6604; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
6605; AVX2-NEXT:    vzeroupper
6606; AVX2-NEXT:    retq
6607;
6608; AVX512F-LABEL: vec512_v32i16_to_v8i64_factor4:
6609; AVX512F:       # %bb.0:
6610; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
6611; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6612; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
6613; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6614; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
6615; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6616; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
6617; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
6618; AVX512F-NEXT:    vzeroupper
6619; AVX512F-NEXT:    retq
6620;
6621; AVX512BW-LABEL: vec512_v32i16_to_v8i64_factor4:
6622; AVX512BW:       # %bb.0:
6623; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
6624; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6625; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
6626; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6627; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6628; AVX512BW-NEXT:    vzeroupper
6629; AVX512BW-NEXT:    retq
6630  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6631  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6632  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6633  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6634  %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 3, i32 45, i32 46, i32 47, i32 4, i32 49, i32 50, i32 51, i32 5, i32 53, i32 54, i32 55, i32 6, i32 57, i32 58, i32 59, i32 7, i32 61, i32 62, i32 63>
6635  %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6636  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6637  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6638  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6639  ret void
6640}
6641
6642define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6643; SSE2-LABEL: vec512_v32i16_to_v4i128_factor8:
6644; SSE2:       # %bb.0:
6645; SSE2-NEXT:    movdqa (%rdi), %xmm0
6646; SSE2-NEXT:    paddb (%rsi), %xmm0
6647; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65535,0,0,0]
6648; SSE2-NEXT:    pand %xmm0, %xmm1
6649; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
6650; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6651; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6652; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6653; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6654; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6655; SSE2-NEXT:    paddb 16(%rdx), %xmm3
6656; SSE2-NEXT:    paddb 48(%rdx), %xmm2
6657; SSE2-NEXT:    paddb 32(%rdx), %xmm0
6658; SSE2-NEXT:    paddb (%rdx), %xmm1
6659; SSE2-NEXT:    movdqa %xmm1, (%rcx)
6660; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
6661; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
6662; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
6663; SSE2-NEXT:    retq
6664;
6665; SSE42-LABEL: vec512_v32i16_to_v4i128_factor8:
6666; SSE42:       # %bb.0:
6667; SSE42-NEXT:    movdqa (%rdi), %xmm0
6668; SSE42-NEXT:    paddb (%rsi), %xmm0
6669; SSE42-NEXT:    pxor %xmm1, %xmm1
6670; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6671; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
6672; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6673; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6674; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6675; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6676; SSE42-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6677; SSE42-NEXT:    paddb 16(%rdx), %xmm3
6678; SSE42-NEXT:    paddb 48(%rdx), %xmm2
6679; SSE42-NEXT:    paddb 32(%rdx), %xmm0
6680; SSE42-NEXT:    paddb (%rdx), %xmm1
6681; SSE42-NEXT:    movdqa %xmm1, (%rcx)
6682; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
6683; SSE42-NEXT:    movdqa %xmm2, 48(%rcx)
6684; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
6685; SSE42-NEXT:    retq
6686;
6687; AVX-LABEL: vec512_v32i16_to_v4i128_factor8:
6688; AVX:       # %bb.0:
6689; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6690; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6691; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
6692; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6693; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
6694; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6695; AVX-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6696; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6697; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6698; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6699; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
6700; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
6701; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
6702; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
6703; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
6704; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
6705; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
6706; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
6707; AVX-NEXT:    retq
6708;
6709; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8:
6710; AVX2-SLOW:       # %bb.0:
6711; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
6712; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
6713; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
6714; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6715; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
6716; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6717; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
6718; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6719; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6720; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6721; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6722; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
6723; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
6724; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
6725; AVX2-SLOW-NEXT:    vzeroupper
6726; AVX2-SLOW-NEXT:    retq
6727;
6728; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8:
6729; AVX2-FAST-PERLANE:       # %bb.0:
6730; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
6731; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
6732; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
6733; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6734; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
6735; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6736; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
6737; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6738; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6739; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6740; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
6741; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
6742; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
6743; AVX2-FAST-PERLANE-NEXT:    vzeroupper
6744; AVX2-FAST-PERLANE-NEXT:    retq
6745;
6746; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8:
6747; AVX2-FAST:       # %bb.0:
6748; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
6749; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm1
6750; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
6751; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6752; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
6753; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6754; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
6755; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6756; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6757; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6758; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
6759; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
6760; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
6761; AVX2-FAST-NEXT:    vzeroupper
6762; AVX2-FAST-NEXT:    retq
6763;
6764; AVX512F-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8:
6765; AVX512F-SLOW:       # %bb.0:
6766; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
6767; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6768; AVX512F-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6769; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6770; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6771; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
6772; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
6773; AVX512F-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6774; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6775; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
6776; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6777; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6778; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
6779; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
6780; AVX512F-SLOW-NEXT:    vzeroupper
6781; AVX512F-SLOW-NEXT:    retq
6782;
6783; AVX512F-FAST-LABEL: vec512_v32i16_to_v4i128_factor8:
6784; AVX512F-FAST:       # %bb.0:
6785; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
6786; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6787; AVX512F-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6788; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6789; AVX512F-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6790; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
6791; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
6792; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6793; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
6794; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6795; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6796; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
6797; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
6798; AVX512F-FAST-NEXT:    vzeroupper
6799; AVX512F-FAST-NEXT:    retq
6800;
6801; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8:
6802; AVX512BW:       # %bb.0:
6803; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
6804; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
6805; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15]
6806; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6807; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm1, %zmm2
6808; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
6809; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6810; AVX512BW-NEXT:    vzeroupper
6811; AVX512BW-NEXT:    retq
6812  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6813  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6814  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6815  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6816  %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 1, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 2, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 3, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
6817  %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6818  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6819  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6820  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6821  ret void
6822}
6823
6824define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6825; SSE2-LABEL: vec512_v32i16_to_v2i256_factor16:
6826; SSE2:       # %bb.0:
6827; SSE2-NEXT:    movdqa (%rdi), %xmm0
6828; SSE2-NEXT:    paddb (%rsi), %xmm0
6829; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65535,0,0,0]
6830; SSE2-NEXT:    pand %xmm0, %xmm1
6831; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6832; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6833; SSE2-NEXT:    movaps 16(%rdx), %xmm2
6834; SSE2-NEXT:    movaps 48(%rdx), %xmm3
6835; SSE2-NEXT:    paddb 32(%rdx), %xmm0
6836; SSE2-NEXT:    paddb (%rdx), %xmm1
6837; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
6838; SSE2-NEXT:    movaps %xmm2, 16(%rcx)
6839; SSE2-NEXT:    movdqa %xmm1, (%rcx)
6840; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
6841; SSE2-NEXT:    retq
6842;
6843; SSE42-LABEL: vec512_v32i16_to_v2i256_factor16:
6844; SSE42:       # %bb.0:
6845; SSE42-NEXT:    movdqa (%rdi), %xmm0
6846; SSE42-NEXT:    paddb (%rsi), %xmm0
6847; SSE42-NEXT:    pxor %xmm1, %xmm1
6848; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6849; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6850; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6851; SSE42-NEXT:    movaps 16(%rdx), %xmm2
6852; SSE42-NEXT:    movaps 48(%rdx), %xmm3
6853; SSE42-NEXT:    paddb 32(%rdx), %xmm0
6854; SSE42-NEXT:    paddb (%rdx), %xmm1
6855; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
6856; SSE42-NEXT:    movaps %xmm2, 16(%rcx)
6857; SSE42-NEXT:    movdqa %xmm1, (%rcx)
6858; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
6859; SSE42-NEXT:    retq
6860;
6861; AVX-LABEL: vec512_v32i16_to_v2i256_factor16:
6862; AVX:       # %bb.0:
6863; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6864; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6865; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6866; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6867; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
6868; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6869; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
6870; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
6871; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
6872; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
6873; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
6874; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
6875; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
6876; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
6877; AVX-NEXT:    retq
6878;
6879; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16:
6880; AVX2:       # %bb.0:
6881; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
6882; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6883; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6884; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm1
6885; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6886; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6887; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6888; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
6889; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
6890; AVX2-NEXT:    vzeroupper
6891; AVX2-NEXT:    retq
6892;
6893; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16:
6894; AVX512F:       # %bb.0:
6895; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
6896; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6897; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6898; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm1
6899; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6900; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
6901; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
6902; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
6903; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
6904; AVX512F-NEXT:    vzeroupper
6905; AVX512F-NEXT:    retq
6906;
6907; AVX512BW-LABEL: vec512_v32i16_to_v2i256_factor16:
6908; AVX512BW:       # %bb.0:
6909; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
6910; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6911; AVX512BW-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6912; AVX512BW-NEXT:    vpand %ymm0, %ymm1, %ymm1
6913; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6914; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
6915; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
6916; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
6917; AVX512BW-NEXT:    vzeroupper
6918; AVX512BW-NEXT:    retq
6919  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6920  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6921  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6922  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6923  %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 1, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
6924  %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6925  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6926  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6927  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6928  ret void
6929}
6930
6931define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6932; SSE2-LABEL: vec512_v32i16_to_v1i512_factor32:
6933; SSE2:       # %bb.0:
6934; SSE2-NEXT:    movdqa (%rdi), %xmm0
6935; SSE2-NEXT:    paddb (%rsi), %xmm0
6936; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6937; SSE2-NEXT:    movaps 16(%rdx), %xmm1
6938; SSE2-NEXT:    movaps 32(%rdx), %xmm2
6939; SSE2-NEXT:    movaps 48(%rdx), %xmm3
6940; SSE2-NEXT:    paddb (%rdx), %xmm0
6941; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
6942; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
6943; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
6944; SSE2-NEXT:    movdqa %xmm0, (%rcx)
6945; SSE2-NEXT:    retq
6946;
6947; SSE42-LABEL: vec512_v32i16_to_v1i512_factor32:
6948; SSE42:       # %bb.0:
6949; SSE42-NEXT:    movdqa (%rdi), %xmm0
6950; SSE42-NEXT:    paddb (%rsi), %xmm0
6951; SSE42-NEXT:    pxor %xmm1, %xmm1
6952; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6953; SSE42-NEXT:    movaps 16(%rdx), %xmm0
6954; SSE42-NEXT:    movaps 32(%rdx), %xmm2
6955; SSE42-NEXT:    movaps 48(%rdx), %xmm3
6956; SSE42-NEXT:    paddb (%rdx), %xmm1
6957; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
6958; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
6959; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
6960; SSE42-NEXT:    movdqa %xmm1, (%rcx)
6961; SSE42-NEXT:    retq
6962;
6963; AVX-LABEL: vec512_v32i16_to_v1i512_factor32:
6964; AVX:       # %bb.0:
6965; AVX-NEXT:    vmovdqa (%rdi), %xmm0
6966; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
6967; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
6968; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
6969; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
6970; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
6971; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
6972; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
6973; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
6974; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
6975; AVX-NEXT:    vzeroupper
6976; AVX-NEXT:    retq
6977;
6978; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32:
6979; AVX2:       # %bb.0:
6980; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
6981; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6982; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6983; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
6984; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
6985; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6986; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
6987; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
6988; AVX2-NEXT:    vzeroupper
6989; AVX2-NEXT:    retq
6990;
6991; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32:
6992; AVX512F:       # %bb.0:
6993; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
6994; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
6995; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6996; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
6997; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
6998; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
6999; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
7000; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
7001; AVX512F-NEXT:    vzeroupper
7002; AVX512F-NEXT:    retq
7003;
7004; AVX512BW-LABEL: vec512_v32i16_to_v1i512_factor32:
7005; AVX512BW:       # %bb.0:
7006; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
7007; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
7008; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
7009; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
7010; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
7011; AVX512BW-NEXT:    vzeroupper
7012; AVX512BW-NEXT:    retq
7013  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7014  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7015  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7016  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
7017  %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
7018  %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
7019  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7020  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7021  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7022  ret void
7023}
7024
7025define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7026; SSE2-LABEL: vec512_v16i32_to_v8i64_factor2:
7027; SSE2:       # %bb.0:
7028; SSE2-NEXT:    movdqa (%rdi), %xmm0
7029; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
7030; SSE2-NEXT:    paddb (%rsi), %xmm0
7031; SSE2-NEXT:    paddb 16(%rsi), %xmm1
7032; SSE2-NEXT:    pxor %xmm2, %xmm2
7033; SSE2-NEXT:    movdqa %xmm1, %xmm3
7034; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7035; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
7036; SSE2-NEXT:    movdqa %xmm0, %xmm4
7037; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
7038; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7039; SSE2-NEXT:    paddb 16(%rdx), %xmm0
7040; SSE2-NEXT:    paddb (%rdx), %xmm4
7041; SSE2-NEXT:    paddb 48(%rdx), %xmm1
7042; SSE2-NEXT:    paddb 32(%rdx), %xmm3
7043; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
7044; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
7045; SSE2-NEXT:    movdqa %xmm4, (%rcx)
7046; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
7047; SSE2-NEXT:    retq
7048;
7049; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2:
7050; SSE42:       # %bb.0:
7051; SSE42-NEXT:    movdqa (%rdi), %xmm0
7052; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
7053; SSE42-NEXT:    paddb (%rsi), %xmm0
7054; SSE42-NEXT:    paddb 16(%rsi), %xmm1
7055; SSE42-NEXT:    pxor %xmm2, %xmm2
7056; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
7057; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
7058; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
7059; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7060; SSE42-NEXT:    paddb 16(%rdx), %xmm0
7061; SSE42-NEXT:    paddb (%rdx), %xmm4
7062; SSE42-NEXT:    paddb 48(%rdx), %xmm1
7063; SSE42-NEXT:    paddb 32(%rdx), %xmm3
7064; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
7065; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
7066; SSE42-NEXT:    movdqa %xmm4, (%rcx)
7067; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
7068; SSE42-NEXT:    retq
7069;
7070; AVX-LABEL: vec512_v16i32_to_v8i64_factor2:
7071; AVX:       # %bb.0:
7072; AVX-NEXT:    vmovdqa (%rdi), %xmm0
7073; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
7074; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
7075; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
7076; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
7077; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
7078; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
7079; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
7080; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
7081; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
7082; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
7083; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
7084; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
7085; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
7086; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
7087; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
7088; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
7089; AVX-NEXT:    retq
7090;
7091; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
7092; AVX2:       # %bb.0:
7093; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
7094; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7095; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
7096; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
7097; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
7098; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7099; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
7100; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
7101; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
7102; AVX2-NEXT:    vzeroupper
7103; AVX2-NEXT:    retq
7104;
7105; AVX512F-LABEL: vec512_v16i32_to_v8i64_factor2:
7106; AVX512F:       # %bb.0:
7107; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
7108; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7109; AVX512F-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
7110; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7111; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
7112; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
7113; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
7114; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
7115; AVX512F-NEXT:    vzeroupper
7116; AVX512F-NEXT:    retq
7117;
7118; AVX512BW-LABEL: vec512_v16i32_to_v8i64_factor2:
7119; AVX512BW:       # %bb.0:
7120; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
7121; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7122; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
7123; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
7124; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
7125; AVX512BW-NEXT:    vzeroupper
7126; AVX512BW-NEXT:    retq
7127  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7128  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7129  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7130  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7131  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
7132  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7133  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7134  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7135  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7136  ret void
7137}
7138
7139define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7140; SSE2-LABEL: vec512_v16i32_to_v4i128_factor4:
7141; SSE2:       # %bb.0:
7142; SSE2-NEXT:    movdqa (%rdi), %xmm0
7143; SSE2-NEXT:    paddb (%rsi), %xmm0
7144; SSE2-NEXT:    xorps %xmm1, %xmm1
7145; SSE2-NEXT:    movdqa %xmm0, %xmm2
7146; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7147; SSE2-NEXT:    xorps %xmm3, %xmm3
7148; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
7149; SSE2-NEXT:    movdqa %xmm0, %xmm4
7150; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7151; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
7152; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
7153; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
7154; SSE2-NEXT:    paddb 16(%rdx), %xmm0
7155; SSE2-NEXT:    paddb 32(%rdx), %xmm4
7156; SSE2-NEXT:    paddb (%rdx), %xmm3
7157; SSE2-NEXT:    paddb 48(%rdx), %xmm2
7158; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
7159; SSE2-NEXT:    movdqa %xmm3, (%rcx)
7160; SSE2-NEXT:    movdqa %xmm4, 32(%rcx)
7161; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
7162; SSE2-NEXT:    retq
7163;
7164; SSE42-LABEL: vec512_v16i32_to_v4i128_factor4:
7165; SSE42:       # %bb.0:
7166; SSE42-NEXT:    movdqa (%rdi), %xmm0
7167; SSE42-NEXT:    paddb (%rsi), %xmm0
7168; SSE42-NEXT:    pxor %xmm1, %xmm1
7169; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7170; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
7171; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
7172; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7173; SSE42-NEXT:    pxor %xmm4, %xmm4
7174; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
7175; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
7176; SSE42-NEXT:    paddb 16(%rdx), %xmm3
7177; SSE42-NEXT:    paddb 32(%rdx), %xmm2
7178; SSE42-NEXT:    paddb (%rdx), %xmm1
7179; SSE42-NEXT:    paddb 48(%rdx), %xmm0
7180; SSE42-NEXT:    movdqa %xmm0, 48(%rcx)
7181; SSE42-NEXT:    movdqa %xmm1, (%rcx)
7182; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
7183; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
7184; SSE42-NEXT:    retq
7185;
7186; AVX-LABEL: vec512_v16i32_to_v4i128_factor4:
7187; AVX:       # %bb.0:
7188; AVX-NEXT:    vmovdqa (%rdi), %xmm0
7189; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
7190; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
7191; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
7192; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
7193; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
7194; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
7195; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
7196; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
7197; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
7198; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
7199; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
7200; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
7201; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
7202; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
7203; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
7204; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
7205; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
7206; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
7207; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
7208; AVX-NEXT:    vzeroupper
7209; AVX-NEXT:    retq
7210;
7211; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4:
7212; AVX2-SLOW:       # %bb.0:
7213; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
7214; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
7215; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
7216; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
7217; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
7218; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
7219; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
7220; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
7221; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
7222; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7223; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
7224; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
7225; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
7226; AVX2-SLOW-NEXT:    vzeroupper
7227; AVX2-SLOW-NEXT:    retq
7228;
7229; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4:
7230; AVX2-FAST-PERLANE:       # %bb.0:
7231; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
7232; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
7233; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
7234; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
7235; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
7236; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
7237; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
7238; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
7239; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
7240; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7241; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
7242; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
7243; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
7244; AVX2-FAST-PERLANE-NEXT:    vzeroupper
7245; AVX2-FAST-PERLANE-NEXT:    retq
7246;
7247; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4:
7248; AVX2-FAST:       # %bb.0:
7249; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
7250; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7251; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
7252; AVX2-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0]
7253; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm2
7254; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
7255; AVX2-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,0,3,0]
7256; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm3, %ymm0
7257; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
7258; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7259; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
7260; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
7261; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
7262; AVX2-FAST-NEXT:    vzeroupper
7263; AVX2-FAST-NEXT:    retq
7264;
7265; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4:
7266; AVX512F:       # %bb.0:
7267; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
7268; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7269; AVX512F-NEXT:    movw $4369, %ax # imm = 0x1111
7270; AVX512F-NEXT:    kmovw %eax, %k1
7271; AVX512F-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
7272; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7273; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
7274; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
7275; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
7276; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
7277; AVX512F-NEXT:    vzeroupper
7278; AVX512F-NEXT:    retq
7279;
7280; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4:
7281; AVX512BW:       # %bb.0:
7282; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
7283; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7284; AVX512BW-NEXT:    movb $17, %al
7285; AVX512BW-NEXT:    kmovd %eax, %k1
7286; AVX512BW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
7287; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
7288; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15]
7289; AVX512BW-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
7290; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm0
7291; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
7292; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
7293; AVX512BW-NEXT:    vzeroupper
7294; AVX512BW-NEXT:    retq
7295  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7296  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7297  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7298  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7299  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
7300  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7301  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7302  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7303  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7304  ret void
7305}
7306
7307define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7308; SSE2-LABEL: vec512_v16i32_to_v2i256_factor8:
7309; SSE2:       # %bb.0:
7310; SSE2-NEXT:    movdqa (%rdi), %xmm0
7311; SSE2-NEXT:    paddb (%rsi), %xmm0
7312; SSE2-NEXT:    xorps %xmm1, %xmm1
7313; SSE2-NEXT:    xorps %xmm2, %xmm2
7314; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
7315; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
7316; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
7317; SSE2-NEXT:    movaps 16(%rdx), %xmm1
7318; SSE2-NEXT:    movaps 48(%rdx), %xmm3
7319; SSE2-NEXT:    paddb 32(%rdx), %xmm0
7320; SSE2-NEXT:    paddb (%rdx), %xmm2
7321; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
7322; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
7323; SSE2-NEXT:    movdqa %xmm2, (%rcx)
7324; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
7325; SSE2-NEXT:    retq
7326;
7327; SSE42-LABEL: vec512_v16i32_to_v2i256_factor8:
7328; SSE42:       # %bb.0:
7329; SSE42-NEXT:    movdqa (%rdi), %xmm0
7330; SSE42-NEXT:    paddb (%rsi), %xmm0
7331; SSE42-NEXT:    pxor %xmm1, %xmm1
7332; SSE42-NEXT:    pxor %xmm2, %xmm2
7333; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
7334; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7335; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7336; SSE42-NEXT:    movaps 16(%rdx), %xmm1
7337; SSE42-NEXT:    movaps 48(%rdx), %xmm3
7338; SSE42-NEXT:    paddb 32(%rdx), %xmm0
7339; SSE42-NEXT:    paddb (%rdx), %xmm2
7340; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
7341; SSE42-NEXT:    movaps %xmm1, 16(%rcx)
7342; SSE42-NEXT:    movdqa %xmm2, (%rcx)
7343; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
7344; SSE42-NEXT:    retq
7345;
7346; AVX-LABEL: vec512_v16i32_to_v2i256_factor8:
7347; AVX:       # %bb.0:
7348; AVX-NEXT:    vmovdqa (%rdi), %xmm0
7349; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
7350; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
7351; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
7352; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
7353; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
7354; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
7355; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
7356; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
7357; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
7358; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
7359; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
7360; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
7361; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
7362; AVX-NEXT:    retq
7363;
7364; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8:
7365; AVX2-SLOW:       # %bb.0:
7366; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
7367; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7368; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
7369; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7370; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7371; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7372; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7373; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
7374; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
7375; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
7376; AVX2-SLOW-NEXT:    vzeroupper
7377; AVX2-SLOW-NEXT:    retq
7378;
7379; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v2i256_factor8:
7380; AVX2-FAST-PERLANE:       # %bb.0:
7381; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
7382; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7383; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
7384; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7385; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7386; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7387; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
7388; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
7389; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
7390; AVX2-FAST-PERLANE-NEXT:    vzeroupper
7391; AVX2-FAST-PERLANE-NEXT:    retq
7392;
7393; AVX2-FAST-LABEL: vec512_v16i32_to_v2i256_factor8:
7394; AVX2-FAST:       # %bb.0:
7395; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
7396; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7397; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
7398; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7399; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7400; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
7401; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
7402; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
7403; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
7404; AVX2-FAST-NEXT:    vzeroupper
7405; AVX2-FAST-NEXT:    retq
7406;
7407; AVX512F-LABEL: vec512_v16i32_to_v2i256_factor8:
7408; AVX512F:       # %bb.0:
7409; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
7410; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7411; AVX512F-NEXT:    movw $257, %ax # imm = 0x101
7412; AVX512F-NEXT:    kmovw %eax, %k1
7413; AVX512F-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
7414; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7415; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
7416; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
7417; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
7418; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
7419; AVX512F-NEXT:    vzeroupper
7420; AVX512F-NEXT:    retq
7421;
7422; AVX512BW-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8:
7423; AVX512BW-SLOW:       # %bb.0:
7424; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
7425; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7426; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
7427; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7428; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7429; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7430; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
7431; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
7432; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
7433; AVX512BW-SLOW-NEXT:    vzeroupper
7434; AVX512BW-SLOW-NEXT:    retq
7435;
7436; AVX512BW-FAST-LABEL: vec512_v16i32_to_v2i256_factor8:
7437; AVX512BW-FAST:       # %bb.0:
7438; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
7439; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
7440; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
7441; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7442; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7443; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7444; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
7445; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
7446; AVX512BW-FAST-NEXT:    vzeroupper
7447; AVX512BW-FAST-NEXT:    retq
7448  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7449  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7450  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7451  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7452  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
7453  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7454  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7455  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7456  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7457  ret void
7458}
7459
define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_v16i32_to_v1i512_factor16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps 16(%rdx), %xmm0
; SSE2-NEXT:    movaps 32(%rdx), %xmm2
; SSE2-NEXT:    movaps 48(%rdx), %xmm3
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
; SSE2-NEXT:    movaps %xmm3, 48(%rcx)
; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_v16i32_to_v1i512_factor16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    movaps 16(%rdx), %xmm0
; SSE42-NEXT:    movaps 32(%rdx), %xmm2
; SSE42-NEXT:    movaps 48(%rdx), %xmm3
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
; SSE42-NEXT:    movaps %xmm3, 48(%rcx)
; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v16i32_to_v1i512_factor16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

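; i64 -> i128 in-register zero extension (factor 2): qwords 0-3 of the <8 x i64> input land in the even qword slots, the odd slots are zeroed.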
define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v8i64_to_v4i128_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    paddb 16(%rsi), %xmm1
; SSE-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    movq {{.*#+}} xmm3 = xmm0[0],zero
; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    paddb 16(%rdx), %xmm0
; SSE-NEXT:    paddb (%rdx), %xmm3
; SSE-NEXT:    paddb 48(%rdx), %xmm1
; SSE-NEXT:    paddb 32(%rdx), %xmm2
; SSE-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE-NEXT:    movdqa %xmm1, 48(%rcx)
; SSE-NEXT:    movdqa %xmm3, (%rcx)
; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    movb $85, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT:    movb $5, %al
; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
; AVX512BW-SLOW-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT:    movb $5, %al
; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
; AVX512BW-FAST-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,5,3,7]
; AVX512BW-FAST-NEXT:    vpermi2q %ymm2, %ymm0, %ymm3
; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
  %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

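; i64 -> i256 in-register zero extension (factor 4): qwords 0 and 1 of the <8 x i64> input land in slots 0 and 4, the remaining qwords are zeroed.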
define void @vec512_v8i64_to_v2i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v8i64_to_v2i256_factor4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    movaps 16(%rdx), %xmm2
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    paddb (%rdx), %xmm1
; SSE-NEXT:    paddb 32(%rdx), %xmm0
; SSE-NEXT:    movaps %xmm3, 48(%rcx)
; SSE-NEXT:    movaps %xmm2, 16(%rcx)
; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE-NEXT:    movdqa %xmm1, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = xmm0[0],zero
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    movb $17, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v8i64_to_v2i256_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = xmm0[0],zero
; AVX512BW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

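; i64 -> i512 in-register zero extension (factor 8): only qword 0 of the <8 x i64> input is kept, qwords 1-7 are zeroed.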
define void @vec512_v8i64_to_v1i512_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v8i64_to_v1i512_factor8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    movaps 16(%rdx), %xmm1
; SSE-NEXT:    movaps 32(%rdx), %xmm2
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    paddb (%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 32(%rcx)
; SSE-NEXT:    movaps %xmm3, 48(%rcx)
; SSE-NEXT:    movaps %xmm1, 16(%rcx)
; SSE-NEXT:    movdqa %xmm0, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v8i64_to_v1i512_factor8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

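; i128 -> i256 in-register zero extension (factor 2): 128-bit lanes 0 and 1 of the input land in lanes 0 and 2, the odd lanes are zeroed.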
define void @vec512_v4i128_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v4i128_to_v2i256_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    paddb 16(%rsi), %xmm1
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movaps 16(%rdx), %xmm2
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    paddb (%rdx), %xmm0
; SSE-NEXT:    paddb 32(%rdx), %xmm1
; SSE-NEXT:    movaps %xmm3, 48(%rcx)
; SSE-NEXT:    movaps %xmm2, 16(%rcx)
; SSE-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE-NEXT:    movdqa %xmm0, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa %xmm0, %xmm0
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    movb $51, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v4i128_to_v2i256_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, %xmm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

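; i128 -> i512 in-register zero extension (factor 4): only 128-bit lane 0 of the input is kept, lanes 1-3 are zeroed.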
define void @vec512_v4i128_to_v1i512_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v4i128_to_v1i512_factor4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movaps 16(%rdx), %xmm1
; SSE-NEXT:    movaps 32(%rdx), %xmm2
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    paddb (%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 32(%rcx)
; SSE-NEXT:    movaps %xmm3, 48(%rcx)
; SSE-NEXT:    movaps %xmm1, 16(%rcx)
; SSE-NEXT:    movdqa %xmm0, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v4i128_to_v1i512_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

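; i256 -> i512 in-register zero extension (factor 2): the low 256 bits of the input are kept, the upper 256 bits are zeroed.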
define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_v2i256_to_v1i512_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    paddb 16(%rsi), %xmm1
; SSE-NEXT:    movaps 32(%rdx), %xmm2
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    paddb 16(%rdx), %xmm1
; SSE-NEXT:    paddb (%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 32(%rcx)
; SSE-NEXT:    movaps %xmm3, 48(%rcx)
; SSE-NEXT:    movdqa %xmm0, (%rcx)
; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: vec512_v2i256_to_v1i512_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <2 x i256>
  %zextd.vec = shufflevector <2 x i256> %in.vec.cast, <2 x i256> zeroinitializer, <2 x i32> <i32 0, i32 3>
  %out.bytevec = bitcast <2 x i256> %zextd.vec to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; FALLBACK8: {{.*}}
; FALLBACK9: {{.*}}
