; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13

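; Reader's summary (not part of the autogenerated checks): every test below
; follows the same skeleton. Two <64 x i8> vectors are loaded and added to
; form the input, a shufflevector broadcasts element 0 of that sum into the
; first slot of each widened element while the remaining slots take elements
; from the tail of the input, the narrow result is padded back out to 64
; bytes, a bias vector loaded from a third pointer is added, and the sum is
; stored. A minimal sketch of that shape, with an illustrative mask only:
;
;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
;   %bcast = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer,
;            <4 x i32> <i32 0, i32 5, i32 0, i32 7>
;   %padded = shufflevector <4 x i8> %bcast, <4 x i8> poison, <64 x i32> ...
;   %out.vec = add <64 x i8> %padded, %out.vec.bias
;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
;
; In the checks the four pointer arguments land in %rdi, %rsi, %rdx and %rcx,
; matching the SysV x86-64 argument order.
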
define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec.padded = shufflevector <4 x i8> %broadcast.of.zextinreg, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
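
; Reader's note (not autogenerated): the <i32 0, i32 5, i32 0, i32 7> mask
; above shows up verbatim as the byte-shuffle control on SSE4.2 and newer,
; xmm0[0,5,0,7,u,...], while plain SSE2 has no byte-granular shuffle and
; synthesizes the same permutation from the longer
; punpcklbw/pshufhw/pshufd/pshuflw/packuswb chain.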

define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    packuswb %xmm2, %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.zextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    packuswb %xmm3, %xmm3
; SSE2-NEXT:    paddb (%rdx), %xmm3
; SSE2-NEXT:    movdqa %xmm3, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
  %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.zextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
; SSE2-NEXT:    paddb (%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec = bitcast <4 x i16> %broadcast.of.zextinreg to <8 x i8>
  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 16(%rsi), %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    paddb (%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 16(%rsi), %xmm1
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 16(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
; SSE42-NEXT:    paddb 16(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 16(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
; SSE42-NEXT:    paddb 16(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    paddb 16(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    paddb (%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 16(%rsi), %xmm1
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm0
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512F-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX512F-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm0
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX512DQ-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT:    vmovd %xmm0, %eax
; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  %out.bytevec = bitcast <8 x i16> %broadcast.of.zextinreg to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 16(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
; SSE42-NEXT:    paddb 16(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm0
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm0
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i16> %broadcast.of.zextinreg to <16 x i8>
  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
905
906define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
907; SSE2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
908; SSE2:       # %bb.0:
909; SSE2-NEXT:    movdqa (%rdi), %xmm0
910; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
911; SSE2-NEXT:    paddb (%rsi), %xmm0
912; SSE2-NEXT:    paddb 16(%rsi), %xmm1
913; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
914; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
915; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
916; SSE2-NEXT:    paddb (%rdx), %xmm0
917; SSE2-NEXT:    movdqa %xmm0, (%rcx)
918; SSE2-NEXT:    retq
919;
920; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
921; SSE42:       # %bb.0:
922; SSE42-NEXT:    movdqa (%rdi), %xmm0
923; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
924; SSE42-NEXT:    paddb 16(%rsi), %xmm1
925; SSE42-NEXT:    paddb (%rsi), %xmm0
926; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
927; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
928; SSE42-NEXT:    paddb (%rdx), %xmm0
929; SSE42-NEXT:    movdqa %xmm0, (%rcx)
930; SSE42-NEXT:    retq
931;
932; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
933; AVX:       # %bb.0:
934; AVX-NEXT:    vmovdqa (%rdi), %xmm0
935; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
936; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
937; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
938; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
939; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
940; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
941; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
942; AVX-NEXT:    retq
943;
944; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
945; AVX2-SLOW:       # %bb.0:
946; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
947; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
948; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
949; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
950; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
951; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
952; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
953; AVX2-SLOW-NEXT:    vzeroupper
954; AVX2-SLOW-NEXT:    retq
955;
956; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
957; AVX2-FAST-PERLANE:       # %bb.0:
958; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
959; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
960; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
961; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd %xmm0, %xmm0
962; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
963; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
964; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
965; AVX2-FAST-PERLANE-NEXT:    vzeroupper
966; AVX2-FAST-PERLANE-NEXT:    retq
967;
968; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
969; AVX2-FAST:       # %bb.0:
970; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
971; AVX2-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
972; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
973; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
974; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
975; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
976; AVX2-FAST-NEXT:    vzeroupper
977; AVX2-FAST-NEXT:    retq
978;
979; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
980; AVX512F:       # %bb.0:
981; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
982; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
983; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
984; AVX512F-NEXT:    vpermd %ymm1, %ymm0, %ymm0
985; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
986; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
987; AVX512F-NEXT:    vzeroupper
988; AVX512F-NEXT:    retq
989;
990; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
991; AVX512DQ:       # %bb.0:
992; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
993; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
994; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
995; AVX512DQ-NEXT:    vpermd %ymm1, %ymm0, %ymm0
996; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
997; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
998; AVX512DQ-NEXT:    vzeroupper
999; AVX512DQ-NEXT:    retq
1000;
1001; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
1002; AVX512BW:       # %bb.0:
1003; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1004; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
1005; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1006; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
1007; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1008; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1009; AVX512BW-NEXT:    vzeroupper
1010; AVX512BW-NEXT:    retq
1011  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1012  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1013  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1014  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
1015  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
1016  %out.bytevec = bitcast <4 x i32> %broadcast.of.zextinreg to <16 x i8>
1017  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1018  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1019  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1020  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1021  ret void
1022}
1023
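; Byte 0 of the (base + bias) input replaces the low byte of each of the 16
; i16 lanes in its upper 32-byte half.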
1024define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1025; SSE2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1026; SSE2:       # %bb.0:
1027; SSE2-NEXT:    movdqa (%rdi), %xmm0
1028; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1029; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1030; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1031; SSE2-NEXT:    paddb (%rsi), %xmm0
1032; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1033; SSE2-NEXT:    psrlw $8, %xmm1
1034; SSE2-NEXT:    packuswb %xmm1, %xmm1
1035; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1036; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1037; SSE2-NEXT:    movdqa %xmm0, %xmm3
1038; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1039; SSE2-NEXT:    psrlw $8, %xmm2
1040; SSE2-NEXT:    packuswb %xmm2, %xmm2
1041; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1042; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1043; SSE2-NEXT:    paddb (%rdx), %xmm3
1044; SSE2-NEXT:    movdqa %xmm3, (%rcx)
1045; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1046; SSE2-NEXT:    retq
1047;
1048; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1049; SSE42:       # %bb.0:
1050; SSE42-NEXT:    movdqa (%rdi), %xmm0
1051; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1052; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1053; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1054; SSE42-NEXT:    paddb (%rsi), %xmm0
1055; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1056; SSE42-NEXT:    movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
1057; SSE42-NEXT:    pshufb %xmm3, %xmm1
1058; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1059; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1060; SSE42-NEXT:    movdqa %xmm0, %xmm4
1061; SSE42-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1062; SSE42-NEXT:    pshufb %xmm3, %xmm2
1063; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1064; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1065; SSE42-NEXT:    paddb (%rdx), %xmm4
1066; SSE42-NEXT:    movdqa %xmm4, (%rcx)
1067; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1068; SSE42-NEXT:    retq
1069;
1070; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1071; AVX:       # %bb.0:
1072; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1073; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1074; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1075; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1076; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1077; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1078; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
1079; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1080; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1081; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1082; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1083; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1084; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1085; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1086; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1087; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1088; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1089; AVX-NEXT:    retq
1090;
1091; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1092; AVX2:       # %bb.0:
1093; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1094; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1095; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1096; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1097; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1098; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
1099; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1100; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1101; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1102; AVX2-NEXT:    vzeroupper
1103; AVX2-NEXT:    retq
1104;
1105; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1106; AVX512F:       # %bb.0:
1107; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1108; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1109; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1110; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1111; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1112; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
1113; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1114; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1115; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1116; AVX512F-NEXT:    vzeroupper
1117; AVX512F-NEXT:    retq
1118;
1119; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1120; AVX512DQ:       # %bb.0:
1121; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1122; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1123; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1124; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1125; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1126; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
1127; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1128; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1129; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1130; AVX512DQ-NEXT:    vzeroupper
1131; AVX512DQ-NEXT:    retq
1132;
1133; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1134; AVX512BW:       # %bb.0:
1135; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1136; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1137; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1138; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1139; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
1140; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1141; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1142; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1143; AVX512BW-NEXT:    vzeroupper
1144; AVX512BW-NEXT:    retq
1145  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1146  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1147  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1148  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
1149  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1150  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1151  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1152  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1153  ret void
1154}
1155
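; Byte 0 replaces the low byte of each of the 8 i32 lanes in the upper
; 32-byte half.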
1156define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1157; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1158; SSE2:       # %bb.0:
1159; SSE2-NEXT:    movdqa (%rdi), %xmm0
1160; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1161; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1162; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1163; SSE2-NEXT:    paddb (%rsi), %xmm0
1164; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1165; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1166; SSE2-NEXT:    pand %xmm3, %xmm1
1167; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1168; SSE2-NEXT:    pand %xmm3, %xmm2
1169; SSE2-NEXT:    pandn %xmm0, %xmm3
1170; SSE2-NEXT:    por %xmm3, %xmm1
1171; SSE2-NEXT:    por %xmm2, %xmm3
1172; SSE2-NEXT:    paddb 16(%rdx), %xmm3
1173; SSE2-NEXT:    paddb (%rdx), %xmm1
1174; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1175; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
1176; SSE2-NEXT:    retq
1177;
1178; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1179; SSE42:       # %bb.0:
1180; SSE42-NEXT:    movdqa (%rdi), %xmm0
1181; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1182; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1183; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1184; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1185; SSE42-NEXT:    paddb (%rsi), %xmm0
1186; SSE42-NEXT:    movdqa %xmm0, %xmm3
1187; SSE42-NEXT:    palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1188; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1189; SSE42-NEXT:    pshufb %xmm1, %xmm3
1190; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1191; SSE42-NEXT:    pshufb %xmm1, %xmm0
1192; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1193; SSE42-NEXT:    paddb (%rdx), %xmm3
1194; SSE42-NEXT:    movdqa %xmm3, (%rcx)
1195; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1196; SSE42-NEXT:    retq
1197;
1198; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1199; AVX:       # %bb.0:
1200; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1201; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1202; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1203; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1204; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1205; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1206; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1207; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1208; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1209; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1210; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1211; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1212; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1213; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1214; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1215; AVX-NEXT:    retq
1216;
1217; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1218; AVX2:       # %bb.0:
1219; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1220; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1221; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1222; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1223; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
1224; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1225; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1226; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1227; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1228; AVX2-NEXT:    vzeroupper
1229; AVX2-NEXT:    retq
1230;
1231; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1232; AVX512F:       # %bb.0:
1233; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1234; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1235; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1236; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1237; AVX512F-NEXT:    vpbroadcastd %xmm1, %ymm1
1238; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1239; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1240; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1241; AVX512F-NEXT:    vzeroupper
1242; AVX512F-NEXT:    retq
1243;
1244; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1245; AVX512DQ:       # %bb.0:
1246; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1247; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1248; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1249; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1250; AVX512DQ-NEXT:    vpbroadcastd %xmm1, %ymm1
1251; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1252; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1253; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1254; AVX512DQ-NEXT:    vzeroupper
1255; AVX512DQ-NEXT:    retq
1256;
1257; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1258; AVX512BW:       # %bb.0:
1259; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1260; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1261; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1262; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm0
1263; AVX512BW-NEXT:    movl $286331153, %eax # imm = 0x11111111
1264; AVX512BW-NEXT:    kmovd %eax, %k1
1265; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
1266; AVX512BW-NEXT:    vpaddb (%rdx), %zmm1, %zmm0
1267; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1268; AVX512BW-NEXT:    vzeroupper
1269; AVX512BW-NEXT:    retq
1270  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1271  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1272  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1273  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
1274  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1275  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1276  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1277  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1278  ret void
1279}
1280
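; Byte 0 replaces the low byte of each of the 4 i64 lanes in the upper
; 32-byte half.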
1281define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1282; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1283; SSE2:       # %bb.0:
1284; SSE2-NEXT:    movdqa (%rdi), %xmm0
1285; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1286; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1287; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1288; SSE2-NEXT:    paddb (%rsi), %xmm0
1289; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1290; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1291; SSE2-NEXT:    pand %xmm3, %xmm1
1292; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1293; SSE2-NEXT:    pand %xmm3, %xmm2
1294; SSE2-NEXT:    pandn %xmm0, %xmm3
1295; SSE2-NEXT:    por %xmm3, %xmm1
1296; SSE2-NEXT:    por %xmm2, %xmm3
1297; SSE2-NEXT:    paddb 16(%rdx), %xmm3
1298; SSE2-NEXT:    paddb (%rdx), %xmm1
1299; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1300; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
1301; SSE2-NEXT:    retq
1302;
1303; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1304; SSE42:       # %bb.0:
1305; SSE42-NEXT:    movdqa (%rdi), %xmm0
1306; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1307; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1308; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1309; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1310; SSE42-NEXT:    paddb (%rsi), %xmm0
1311; SSE42-NEXT:    movdqa %xmm0, %xmm3
1312; SSE42-NEXT:    palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1313; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1314; SSE42-NEXT:    pshufb %xmm1, %xmm3
1315; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1316; SSE42-NEXT:    pshufb %xmm1, %xmm0
1317; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1318; SSE42-NEXT:    paddb (%rdx), %xmm3
1319; SSE42-NEXT:    movdqa %xmm3, (%rcx)
1320; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1321; SSE42-NEXT:    retq
1322;
1323; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1324; AVX:       # %bb.0:
1325; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1326; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1327; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1328; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1329; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1330; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1331; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1332; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1333; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1334; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1335; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1336; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1337; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1338; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1339; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1340; AVX-NEXT:    retq
1341;
1342; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1343; AVX2:       # %bb.0:
1344; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1345; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1346; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1347; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1348; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
1349; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1350; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1351; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1352; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1353; AVX2-NEXT:    vzeroupper
1354; AVX2-NEXT:    retq
1355;
1356; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1357; AVX512F:       # %bb.0:
1358; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1359; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1360; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1361; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1362; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
1363; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1364; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1365; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1366; AVX512F-NEXT:    vzeroupper
1367; AVX512F-NEXT:    retq
1368;
1369; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1370; AVX512DQ:       # %bb.0:
1371; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1372; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1373; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1374; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1375; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
1376; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1377; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1378; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1379; AVX512DQ-NEXT:    vzeroupper
1380; AVX512DQ-NEXT:    retq
1381;
1382; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1383; AVX512BW:       # %bb.0:
1384; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1385; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1386; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1387; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm0
1388; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
1389; AVX512BW-NEXT:    kmovd %eax, %k1
1390; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
1391; AVX512BW-NEXT:    vpaddb (%rdx), %zmm1, %zmm0
1392; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1393; AVX512BW-NEXT:    vzeroupper
1394; AVX512BW-NEXT:    retq
1395  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1396  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1397  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1398  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1399  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1400  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1401  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1402  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1403  ret void
1404}
1405
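; Byte 0 replaces the low byte of each of the 2 i128 lanes in the upper
; 32-byte half.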
1406define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1407; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1408; SSE2:       # %bb.0:
1409; SSE2-NEXT:    movdqa (%rdi), %xmm0
1410; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1411; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1412; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1413; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1414; SSE2-NEXT:    paddb (%rsi), %xmm0
1415; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1416; SSE2-NEXT:    pand %xmm3, %xmm1
1417; SSE2-NEXT:    pand %xmm3, %xmm2
1418; SSE2-NEXT:    pandn %xmm0, %xmm3
1419; SSE2-NEXT:    por %xmm3, %xmm1
1420; SSE2-NEXT:    por %xmm3, %xmm2
1421; SSE2-NEXT:    paddb 16(%rdx), %xmm2
1422; SSE2-NEXT:    paddb (%rdx), %xmm1
1423; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1424; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
1425; SSE2-NEXT:    retq
1426;
1427; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1428; SSE42:       # %bb.0:
1429; SSE42-NEXT:    movdqa (%rdi), %xmm1
1430; SSE42-NEXT:    movdqa 32(%rdi), %xmm2
1431; SSE42-NEXT:    movdqa 48(%rdi), %xmm3
1432; SSE42-NEXT:    paddb 48(%rsi), %xmm3
1433; SSE42-NEXT:    paddb 32(%rsi), %xmm2
1434; SSE42-NEXT:    paddb (%rsi), %xmm1
1435; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1436; SSE42-NEXT:    movdqa %xmm1, %xmm4
1437; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm4
1438; SSE42-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1439; SSE42-NEXT:    paddb 16(%rdx), %xmm1
1440; SSE42-NEXT:    paddb (%rdx), %xmm4
1441; SSE42-NEXT:    movdqa %xmm4, (%rcx)
1442; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
1443; SSE42-NEXT:    retq
1444;
1445; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1446; AVX:       # %bb.0:
1447; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1448; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1449; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1450; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1451; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1452; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1453; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615]
1454; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm1
1455; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
1456; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1457; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1458; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1459; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1460; AVX-NEXT:    retq
1461;
1462; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1463; AVX2:       # %bb.0:
1464; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1465; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1466; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1467; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1468; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1469; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
1470; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
1471; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1472; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1473; AVX2-NEXT:    vzeroupper
1474; AVX2-NEXT:    retq
1475;
1476; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1477; AVX512F:       # %bb.0:
1478; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1479; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
1480; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1481; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1482; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1483; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1484; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
1485; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1486; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
1487; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1488; AVX512F-NEXT:    vzeroupper
1489; AVX512F-NEXT:    retq
1490;
1491; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1492; AVX512DQ:       # %bb.0:
1493; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1494; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
1495; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1496; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1497; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1498; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1499; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
1500; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1501; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
1502; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1503; AVX512DQ-NEXT:    vzeroupper
1504; AVX512DQ-NEXT:    retq
1505;
1506; AVX512BW-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1507; AVX512BW:       # %bb.0:
1508; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1509; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1510; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1511; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1512; AVX512BW-NEXT:    movl $65537, %eax # imm = 0x10001
1513; AVX512BW-NEXT:    kmovd %eax, %k1
1514; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
1515; AVX512BW-NEXT:    vpaddb (%rdx), %zmm1, %zmm0
1516; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1517; AVX512BW-NEXT:    vzeroupper
1518; AVX512BW-NEXT:    retq
1519  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1520  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1521  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1522  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1523  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1524  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1525  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1526  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1527  ret void
1528}
1529
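; Word 0 replaces the low word of each of the 8 i32 lanes in the upper
; 32-byte half (a single vpermw on AVX512BW).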
1530define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1531; SSE2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1532; SSE2:       # %bb.0:
1533; SSE2-NEXT:    movdqa (%rdi), %xmm0
1534; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1535; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1536; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1537; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1538; SSE2-NEXT:    paddb (%rsi), %xmm0
1539; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1540; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1541; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1542; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1543; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1544; SSE2-NEXT:    movdqa %xmm0, %xmm3
1545; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1546; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7]
1547; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1548; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1549; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1550; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1551; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1552; SSE2-NEXT:    paddb (%rdx), %xmm3
1553; SSE2-NEXT:    movdqa %xmm3, (%rcx)
1554; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1555; SSE2-NEXT:    retq
1556;
1557; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1558; SSE42:       # %bb.0:
1559; SSE42-NEXT:    movdqa (%rdi), %xmm0
1560; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1561; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1562; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1563; SSE42-NEXT:    paddb (%rsi), %xmm0
1564; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1565; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1566; SSE42-NEXT:    pshufb %xmm3, %xmm1
1567; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1568; SSE42-NEXT:    movdqa %xmm0, %xmm4
1569; SSE42-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1570; SSE42-NEXT:    pshufb %xmm3, %xmm2
1571; SSE42-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1572; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1573; SSE42-NEXT:    paddb (%rdx), %xmm4
1574; SSE42-NEXT:    movdqa %xmm4, (%rcx)
1575; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1576; SSE42-NEXT:    retq
1577;
1578; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1579; AVX:       # %bb.0:
1580; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1581; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1582; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1583; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1584; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1585; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1586; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1587; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1588; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1589; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1590; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1591; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1592; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1593; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1594; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1595; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1596; AVX-NEXT:    retq
1597;
1598; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1599; AVX2:       # %bb.0:
1600; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1601; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1602; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1603; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1604; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
1605; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1606; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1607; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1608; AVX2-NEXT:    vzeroupper
1609; AVX2-NEXT:    retq
1610;
1611; AVX512F-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1612; AVX512F:       # %bb.0:
1613; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1614; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1615; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1616; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1617; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm1
1618; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1619; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1620; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1621; AVX512F-NEXT:    vzeroupper
1622; AVX512F-NEXT:    retq
1623;
1624; AVX512DQ-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1625; AVX512DQ:       # %bb.0:
1626; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1627; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1628; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1629; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1630; AVX512DQ-NEXT:    vpbroadcastw %xmm1, %ymm1
1631; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1632; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1633; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1634; AVX512DQ-NEXT:    vzeroupper
1635; AVX512DQ-NEXT:    retq
1636;
1637; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1638; AVX512BW:       # %bb.0:
1639; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1640; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1641; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
1642; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
1643; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1644; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1645; AVX512BW-NEXT:    vzeroupper
1646; AVX512BW-NEXT:    retq
1647  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1648  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1649  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1650  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1651  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
1652  %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8>
1653  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1654  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1655  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1656  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1657  ret void
1658}
1659
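; Word 0 replaces the low word of each of the 4 i64 lanes in the upper
; 32-byte half (vpermi2w on AVX512BW).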
1660define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1661; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1662; SSE2:       # %bb.0:
1663; SSE2-NEXT:    movdqa (%rdi), %xmm0
1664; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1665; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1666; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1667; SSE2-NEXT:    paddb (%rsi), %xmm0
1668; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1669; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
1670; SSE2-NEXT:    pand %xmm3, %xmm1
1671; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1672; SSE2-NEXT:    pand %xmm3, %xmm2
1673; SSE2-NEXT:    pandn %xmm0, %xmm3
1674; SSE2-NEXT:    por %xmm3, %xmm1
1675; SSE2-NEXT:    por %xmm2, %xmm3
1676; SSE2-NEXT:    paddb 16(%rdx), %xmm3
1677; SSE2-NEXT:    paddb (%rdx), %xmm1
1678; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1679; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
1680; SSE2-NEXT:    retq
1681;
1682; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1683; SSE42:       # %bb.0:
1684; SSE42-NEXT:    movdqa (%rdi), %xmm0
1685; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1686; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1687; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1688; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1689; SSE42-NEXT:    paddb (%rsi), %xmm0
1690; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1691; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1692; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1693; SSE42-NEXT:    paddb 16(%rdx), %xmm2
1694; SSE42-NEXT:    paddb (%rdx), %xmm1
1695; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1696; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
1697; SSE42-NEXT:    retq
1698;
1699; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1700; AVX:       # %bb.0:
1701; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1702; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1703; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1704; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1705; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1706; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1707; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1708; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1709; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1710; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1711; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1712; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1713; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1714; AVX-NEXT:    retq
1715;
1716; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1717; AVX2:       # %bb.0:
1718; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1719; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1720; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1721; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1722; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
1723; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1724; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1725; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1726; AVX2-NEXT:    vzeroupper
1727; AVX2-NEXT:    retq
1728;
1729; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1730; AVX512F:       # %bb.0:
1731; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1732; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1733; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1734; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1735; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
1736; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1737; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1738; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1739; AVX512F-NEXT:    vzeroupper
1740; AVX512F-NEXT:    retq
1741;
1742; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1743; AVX512DQ:       # %bb.0:
1744; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1745; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1746; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1747; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1748; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
1749; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1750; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1751; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1752; AVX512DQ-NEXT:    vzeroupper
1753; AVX512DQ-NEXT:    retq
1754;
1755; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1756; AVX512BW:       # %bb.0:
1757; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1758; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1759; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1760; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
1761; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1762; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
1763; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1764; AVX512BW-NEXT:    vzeroupper
1765; AVX512BW-NEXT:    retq
1766  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1767  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1768  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1769  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1770  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
1771  %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8>
1772  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1773  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1774  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1775  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1776  ret void
1777}
1778
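; Word 0 replaces the low word of each of the 2 i128 lanes in the upper
; 32-byte half.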
1779define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1780; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1781; SSE2:       # %bb.0:
1782; SSE2-NEXT:    movdqa (%rdi), %xmm0
1783; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1784; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1785; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1786; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1787; SSE2-NEXT:    paddb (%rsi), %xmm0
1788; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
1789; SSE2-NEXT:    pand %xmm3, %xmm1
1790; SSE2-NEXT:    pand %xmm3, %xmm2
1791; SSE2-NEXT:    pandn %xmm0, %xmm3
1792; SSE2-NEXT:    por %xmm3, %xmm1
1793; SSE2-NEXT:    por %xmm3, %xmm2
1794; SSE2-NEXT:    paddb 16(%rdx), %xmm2
1795; SSE2-NEXT:    paddb (%rdx), %xmm1
1796; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1797; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
1798; SSE2-NEXT:    retq
1799;
1800; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1801; SSE42:       # %bb.0:
1802; SSE42-NEXT:    movdqa (%rdi), %xmm0
1803; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1804; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1805; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1806; SSE42-NEXT:    paddb (%rsi), %xmm0
1807; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1808; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1809; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1810; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1811; SSE42-NEXT:    paddb (%rdx), %xmm1
1812; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1813; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1814; SSE42-NEXT:    retq
1815;
1816; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1817; AVX:       # %bb.0:
1818; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1819; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1820; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1821; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1822; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1823; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1824; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1825; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1826; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1827; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1828; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1829; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1830; AVX-NEXT:    retq
1831;
1832; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1833; AVX2:       # %bb.0:
1834; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1835; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1836; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1837; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1838; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1839; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1840; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1841; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1842; AVX2-NEXT:    vzeroupper
1843; AVX2-NEXT:    retq
1844;
1845; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1846; AVX512F:       # %bb.0:
1847; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1848; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
1849; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1850; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1851; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1852; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1853; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1854; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1855; AVX512F-NEXT:    vzeroupper
1856; AVX512F-NEXT:    retq
1857;
1858; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1859; AVX512DQ:       # %bb.0:
1860; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1861; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
1862; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1863; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1864; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1865; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1866; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1867; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1868; AVX512DQ-NEXT:    vzeroupper
1869; AVX512DQ-NEXT:    retq
1870;
1871; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1872; AVX512BW:       # %bb.0:
1873; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1874; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1875; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1876; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
1877; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1878; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
1879; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1880; AVX512BW-NEXT:    vzeroupper
1881; AVX512BW-NEXT:    retq
1882  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1883  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1884  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1885  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1886  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1887  %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8>
1888  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1889  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1890  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1891  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1892  ret void
1893}
1894
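; Dword 0 replaces the low dword of each of the 4 i64 lanes in the upper
; 32-byte half (vpermi2d on AVX512F/AVX512DQ with fast shuffles).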
define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
; SSE2-NEXT:    paddb 48(%rsi), %xmm2
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 32(%rsi), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm3
; SSE2-NEXT:    movdqa %xmm3, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb 32(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE42-NEXT:    paddb 16(%rdx), %xmm2
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
; AVX512F-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
; AVX512DQ-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  %out.bytevec = bitcast <8 x i32> %broadcast.of.zextinreg to <32 x i8>
  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

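; Here element 0 lands at the start of each i128-wide group, with the
; remaining i32s taken from the upper 256 bits (shuffle mask <0,9,10,11,0,13,14,15>).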
define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
; SSE2-NEXT:    paddb 48(%rsi), %xmm2
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 32(%rsi), %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT:    paddb 16(%rdx), %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 32(%rsi), %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT:    paddb 16(%rdx), %xmm0
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
; AVX512F-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
; AVX512DQ-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i32> %broadcast.of.zextinreg to <32 x i8>
  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

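; Same pattern on <8 x i64>: element 0 starts each i128-wide group and
; elements 5 and 7 fill the gaps (shuffle mask <0,5,0,7>).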
define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
; SSE2-NEXT:    paddb 48(%rsi), %xmm2
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 32(%rsi), %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb 32(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE42-NEXT:    paddb 16(%rdx), %xmm0
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512F-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7]
; AVX512F-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7]
; AVX512DQ-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX512BW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec = bitcast <4 x i64> %broadcast.of.zextinreg to <32 x i8>
  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

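; The vec384 i8 tests below shuffle the <64 x i8> input directly: byte 0 of
; the biased input starts every i16-wide group, and the remaining bytes come
; from the input's top bytes and then from the zero vector (the
; zero-extend-in-reg part).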
define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE42-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; SSE42-NEXT:    paddb (%rdx), %xmm2
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm1, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63, i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

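; As above, but byte 0 recurs every 3 bytes, giving 16 i24-wide groups.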
define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    pxor %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pshufb %xmm0, %xmm3
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    paddb 16(%rdx), %xmm1
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 0, i32 52, i32 53, i32 0, i32 55, i32 56, i32 0, i32 58, i32 59, i32 0, i32 61, i32 62, i32 0, i32 64, i32 65, i32 0, i32 67, i32 68, i32 0, i32 70, i32 71, i32 0, i32 73, i32 74, i32 0, i32 76, i32 77, i32 0, i32 79, i32 80, i32 0, i32 82, i32 83, i32 0, i32 85, i32 86, i32 0, i32 88, i32 89, i32 0, i32 91, i32 92, i32 0, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

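; Byte 0 recurs every 4 bytes here, giving 12 i32-wide groups.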
define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm1, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm3
; AVX512F-NEXT:    vpandn %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm3
; AVX512DQ-NEXT:    vpandn %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT:    movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

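; Byte 0 recurs every 6 bytes here, giving 8 i48-wide groups.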
define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    paddb 16(%rdx), %xmm1
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 0, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 0, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 0, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 0, i32 91, i32 92, i32 93, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2943  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2944  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2945  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2946  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2947  ret void
2948}
2949
define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm1, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551360]
; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpbroadcastq %xmm0, %ymm3
; AVX512F-NEXT:    vpandn %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm3
; AVX512DQ-NEXT:    vpandn %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    movdqa %xmm1, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    paddb 16(%rdx), %xmm1
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; AVX-NEXT:    vpmovsxwd {{.*#+}} xmm3 = [4294967040,4294967295,4294967295,4294967040]
; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm2
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm1, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm3 = [255,0,255,0]
; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpandn %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpandn %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    movw $1, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512BW-NEXT:    movl $65537, %eax # imm = 0x10001
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT:    movaps 32(%rdx), %xmm1
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    movaps %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm1
; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
; SSE42-NEXT:    paddb 48(%rsi), %xmm2
; SSE42-NEXT:    paddb (%rsi), %xmm1
; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT:    movaps 32(%rdx), %xmm0
; SSE42-NEXT:    paddb (%rdx), %xmm3
; SSE42-NEXT:    paddb 16(%rdx), %xmm1
; SSE42-NEXT:    movaps %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm2
; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm3 = [255,0,18446744073709551615,18446744073709551360]
; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0))
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0))
; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512DQ-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT:    movw $1, %ax
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE42-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE42-NEXT:    paddb (%rdx), %xmm0
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm2, %xmm1
; SSE42-NEXT:    paddb 32(%rdx), %xmm2
; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm0, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31, i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47>
  %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pxor %xmm3, %xmm3
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    paddb 16(%rdx), %xmm2
; SSE42-NEXT:    paddb 32(%rdx), %xmm3
; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-FAST-NEXT:    vpbroadcastw %xmm0, %ymm2
; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
; AVX512F-FAST-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-FAST-NEXT:    vpbroadcastw %xmm0, %ymm2
; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
; AVX512DQ-FAST-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 0, i32 28, i32 29, i32 0, i32 31, i32 32, i32 0, i32 34, i32 35, i32 0, i32 37, i32 38, i32 0, i32 40, i32 41, i32 0, i32 43, i32 44, i32 0, i32 46, i32 47>
  %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3880  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3881  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3882  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3883  ret void
3884}
3885
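; Widen i16 element 0 to i64 (zero-extend in-reg, factor 4) and broadcast it to all 6 i64 lanes of the 384-bit vector.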
define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm2, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm2
; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX2-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX2-FAST-PERLANE-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX2-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX2-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-FAST-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-FAST-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47>
  %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

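; Widen i16 element 0 to i96 (zero-extend in-reg, factor 6) and broadcast it to all 4 i96 lanes of the 384-bit vector.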
define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; SSE42-NEXT:    pxor %xmm3, %xmm3
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7]
; SSE42-NEXT:    paddb (%rdx), %xmm2
; SSE42-NEXT:    paddb 16(%rdx), %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-FAST-NEXT:    vpbroadcastw %xmm0, %ymm2
; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX512F-FAST-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-FAST-NEXT:    vpbroadcastw %xmm0, %ymm2
; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX512DQ-FAST-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 0, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 0, i32 43, i32 44, i32 45, i32 46, i32 47>
  %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

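; Widen i16 element 0 to i128 (zero-extend in-reg, factor 8) and broadcast it to all 3 i128 lanes of the 384-bit vector.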
define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm2, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm2
; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
  %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

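; Widen i16 element 0 to i192 (zero-extend in-reg, factor 12) and broadcast it to both i192 lanes of the 384-bit vector.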
define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT:    movaps 32(%rdx), %xmm1
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    movaps %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
; SSE42-NEXT:    movaps 32(%rdx), %xmm0
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    paddb 16(%rdx), %xmm2
; SSE42-NEXT:    movaps %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512DQ-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47]
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
  %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

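; Widen i32 element 0 to i64 (zero-extend in-reg, factor 2) and broadcast it to all 6 i64 lanes of the 384-bit vector.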
define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddb (%rdx), %xmm2
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm2, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm2
; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-SLOW-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX2-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-FAST-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpbroadcastq %xmm0, %ymm2
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0]
; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 0, i32 15, i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

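; Widen i32 element 0 to i96 (zero-extend in-reg, factor 3) and broadcast it to all 4 i96 lanes of the 384-bit vector.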
define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
; SSE2-NEXT:    paddb (%rdx), %xmm0
; SSE2-NEXT:    paddb 16(%rdx), %xmm2
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm0, (%rcx)
; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
; SSE42-NEXT:    paddb (%rdx), %xmm2
; SSE42-NEXT:    paddb 16(%rdx), %xmm1
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-SLOW-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-FAST-PERLANE-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0]
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0]
; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 0, i32 16, i32 17, i32 0, i32 19, i32 20, i32 0, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

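; Widen i32 -> i128 (factor 4): i32 element 0 leads each of the three 128-bit groups, per the <12 x i32> shuffle mask below.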
define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    movdqa 16(%rdx), %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    paddb 32(%rdx), %xmm2
; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm2, %xmm0
; SSE42-NEXT:    paddb 32(%rdx), %xmm2
; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0]
; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

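; Widen i32 -> i192 (factor 6): i32 element 0 leads both 192-bit halves of the 384-bit result, per the <12 x i32> shuffle mask below.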
define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
; SSE2-NEXT:    movaps 32(%rdx), %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    pxor %xmm2, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
; SSE42-NEXT:    movaps 32(%rdx), %xmm2
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    paddb 16(%rdx), %xmm0
; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7]
; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512DQ-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 0, i32 19, i32 20, i32 21, i32 22, i32 23>
  %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

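; Widen i64 -> i128 (factor 2): i64 element 0 leads each of the three 128-bit groups, per the <6 x i32> shuffle mask below.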
define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    movdqa 16(%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm1
; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX512BW-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 0, i32 9, i32 0, i32 11>
  %out.bytevec = bitcast <6 x i64> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

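; Widen i64 -> i192 (factor 3): i64 element 0 leads both 192-bit halves, per the <6 x i32> shuffle mask below.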
define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    paddb 48(%rsi), %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE2-NEXT:    movaps 32(%rdx), %xmm2
; SSE2-NEXT:    paddb (%rdx), %xmm1
; SSE2-NEXT:    paddb 16(%rdx), %xmm0
; SSE2-NEXT:    movaps %xmm2, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm1, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
; SSE42-NEXT:    paddb 48(%rsi), %xmm1
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE42-NEXT:    movaps 32(%rdx), %xmm2
; SSE42-NEXT:    paddb (%rdx), %xmm1
; SSE42-NEXT:    paddb 16(%rdx), %xmm0
; SSE42-NEXT:    movaps %xmm2, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm1, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0]
; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT:    vmovaps 32(%rdx), %ymm1
; AVX512DQ-NEXT:    vmovaps %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512BW-SLOW:       # %bb.0:
; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0]
; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT:    vzeroupper
; AVX512BW-SLOW-NEXT:    retq
;
; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512BW-FAST:       # %bb.0:
; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,2,0]
; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT:    vzeroupper
; AVX512BW-FAST-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 0, i32 10, i32 11>
  %out.bytevec = bitcast <6 x i64> %broadcast.of.zextinreg to <48 x i8>
  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

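; 512-bit tests. Widen i8 -> i16 (factor 2): byte 0 is broadcast and zero-extended in-register, giving <byte 0, 0> repeated 32 times across the 64-byte result (no padding step; the mask below takes every odd lane from the zero operand).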
define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm0, %xmm1
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm0, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95, i32 0, i32 97, i32 0, i32 99, i32 0, i32 101, i32 0, i32 103, i32 0, i32 105, i32 0, i32 107, i32 0, i32 109, i32 0, i32 111, i32 0, i32 113, i32 0, i32 115, i32 0, i32 117, i32 0, i32 119, i32 0, i32 121, i32 0, i32 123, i32 0, i32 125, i32 0, i32 127>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

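; Widen i8 -> i32 (factor 4): byte 0 is broadcast and zero-extended in-register, giving <byte 0, 0, 0, 0> repeated 16 times, per the mask below.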
define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm0, %xmm1
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm0, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 0, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 0, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 0, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 0, i32 125, i32 126, i32 127>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

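; Widen i8 -> i64 (factor 8): byte 0 is broadcast and zero-extended in-register, giving byte 0 followed by seven zeros, repeated 8 times.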
define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm0, %xmm1
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm0, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

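; Byte 0 is zero-extended to i128 and splat into all four i128 lanes: only
; every 16th output byte is non-zero before the bias add.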
define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa 16(%rdx), %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movdqa (%rdx), %xmm2
; SSE-NEXT:    paddb %xmm0, %xmm2
; SSE-NEXT:    movdqa 48(%rdx), %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    paddb 32(%rdx), %xmm0
; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE-NEXT:    movdqa %xmm2, (%rcx)
; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

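; Byte 0 is zero-extended to i256 and splat into both i256 halves: only
; output bytes 0 and 32 are non-zero before the bias add, so the 16 and 48
; byte chunks of the output bias pass through untouched.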
define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps 16(%rdx), %xmm1
; SSE-NEXT:    movaps 48(%rdx), %xmm2
; SSE-NEXT:    movdqa (%rdx), %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    paddb 32(%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 48(%rcx)
; SSE-NEXT:    movaps %xmm1, 16(%rcx)
; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE-NEXT:    movdqa %xmm3, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = [255,0]
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

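; i16 element 0 is zero-extended to i32 and splat into all 16 i32 lanes
; (the vector is reinterpreted as <32 x i16> before the shuffle).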
define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm0, %xmm1
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm0, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63]
; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
  %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

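; i16 element 0 is zero-extended to i64 and splat into all eight i64 lanes.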
define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm0, %xmm1
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm0, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63]
; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
  %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

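; i16 element 0 is zero-extended to i128 and splat into all four i128 lanes.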
define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm1, %xmm0
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm1, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm1, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63]
; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

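; i16 element 0 is zero-extended to i256 and splat into both i256 halves.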
define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movaps 16(%rdx), %xmm1
; SSE2-NEXT:    movaps 48(%rdx), %xmm2
; SSE2-NEXT:    movdqa (%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movaps %xmm2, 48(%rcx)
; SSE2-NEXT:    movaps %xmm1, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE42-NEXT:    movaps 16(%rdx), %xmm0
; SSE42-NEXT:    movaps 48(%rdx), %xmm2
; SSE42-NEXT:    movdqa (%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm1, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movaps %xmm2, 48(%rcx)
; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
  %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

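; i32 element 0 is zero-extended to i64 and splat into all eight i64 lanes
; (the vector is reinterpreted as <16 x i32> before the shuffle).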
define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm0, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
; SSE42-NEXT:    paddb %xmm0, %xmm1
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm0, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm0, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm0
; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm2
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
  %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

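; i32 element 0 is zero-extended to i128 and splat into all four i128 lanes.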
define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    paddb (%rsi), %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movdqa 16(%rdx), %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    movdqa (%rdx), %xmm2
; SSE2-NEXT:    paddb %xmm1, %xmm2
; SSE2-NEXT:    movdqa 48(%rdx), %xmm3
; SSE2-NEXT:    paddb %xmm1, %xmm3
; SSE2-NEXT:    paddb 32(%rdx), %xmm1
; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE2-NEXT:    movdqa %xmm2, (%rcx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    paddb (%rsi), %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
; SSE42-NEXT:    paddb %xmm1, %xmm0
; SSE42-NEXT:    movdqa (%rdx), %xmm2
; SSE42-NEXT:    paddb %xmm1, %xmm2
; SSE42-NEXT:    movdqa 48(%rdx), %xmm3
; SSE42-NEXT:    paddb %xmm1, %xmm3
; SSE42-NEXT:    paddb 32(%rdx), %xmm1
; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE42-NEXT:    movdqa %xmm2, (%rcx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm2
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
  %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

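; i32 element 0 is zero-extended to i256 and splat into both i256 halves.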
6808define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6809; SSE2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6810; SSE2:       # %bb.0:
6811; SSE2-NEXT:    movdqa (%rdi), %xmm0
6812; SSE2-NEXT:    paddb (%rsi), %xmm0
6813; SSE2-NEXT:    xorps %xmm1, %xmm1
6814; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
6815; SSE2-NEXT:    movaps 16(%rdx), %xmm0
6816; SSE2-NEXT:    movaps 48(%rdx), %xmm2
6817; SSE2-NEXT:    movdqa (%rdx), %xmm3
6818; SSE2-NEXT:    paddb %xmm1, %xmm3
6819; SSE2-NEXT:    paddb 32(%rdx), %xmm1
6820; SSE2-NEXT:    movaps %xmm2, 48(%rcx)
6821; SSE2-NEXT:    movaps %xmm0, 16(%rcx)
6822; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
6823; SSE2-NEXT:    movdqa %xmm3, (%rcx)
6824; SSE2-NEXT:    retq
6825;
6826; SSE42-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6827; SSE42:       # %bb.0:
6828; SSE42-NEXT:    movdqa (%rdi), %xmm0
6829; SSE42-NEXT:    paddb (%rsi), %xmm0
6830; SSE42-NEXT:    pxor %xmm1, %xmm1
6831; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
6832; SSE42-NEXT:    movaps 16(%rdx), %xmm0
6833; SSE42-NEXT:    movaps 48(%rdx), %xmm2
6834; SSE42-NEXT:    movdqa (%rdx), %xmm3
6835; SSE42-NEXT:    paddb %xmm1, %xmm3
6836; SSE42-NEXT:    paddb 32(%rdx), %xmm1
6837; SSE42-NEXT:    movaps %xmm2, 48(%rcx)
6838; SSE42-NEXT:    movaps %xmm0, 16(%rcx)
6839; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
6840; SSE42-NEXT:    movdqa %xmm3, (%rcx)
6841; SSE42-NEXT:    retq
6842;
6843; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6844; AVX:       # %bb.0:
6845; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
  %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

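; The test below zero-extends each i64 in-register to i128 while broadcasting
; qword 0 to all four i128 lanes. As an illustrative sketch (a comment only,
; not a CHECK-verified expectation), the shufflevector produces
;   <x0, 0, x0, 0, x0, 0, x0, 0>
; where x0 is element 0 of the bitcast <8 x i64> input and the zeros come
; from the zeroinitializer operand; the per-byte output bias is added after.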
define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    movdqa 16(%rdx), %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movdqa (%rdx), %xmm2
; SSE-NEXT:    paddb %xmm0, %xmm2
; SSE-NEXT:    movdqa 48(%rdx), %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    paddb 32(%rdx), %xmm0
; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
; SSE-NEXT:    movdqa %xmm2, (%rcx)
; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm2
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
  %out.bytevec = bitcast <8 x i64> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

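; The test below widens i64 to i256 (factor 4): qword 0 is broadcast to the
; base of each 256-bit half and the remaining qwords of each half are zeroed.
; Illustrative sketch of the shuffle result (comment only, not a CHECK line):
;   <x0, 0, 0, 0, x0, 0, 0, 0>  as <8 x i64>, before the output bias is added.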
define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    movaps 16(%rdx), %xmm1
; SSE-NEXT:    movaps 48(%rdx), %xmm2
; SSE-NEXT:    movdqa (%rdx), %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    paddb 32(%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 48(%rcx)
; SSE-NEXT:    movaps %xmm1, 16(%rcx)
; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE-NEXT:    movdqa %xmm3, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7]
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
  %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
  %out.bytevec = bitcast <8 x i64> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

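; The final test widens i128 to i256 (factor 2): the low 128-bit lane is
; broadcast to the base of each 256-bit half and the odd i128 lanes are
; zeroed. Illustrative sketch of the shuffle result (comment only):
;   <x0, 0, x0, 0>  as <4 x i128>, before the output bias is added.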
define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    paddb (%rsi), %xmm0
; SSE-NEXT:    movaps 16(%rdx), %xmm1
; SSE-NEXT:    movaps 48(%rdx), %xmm2
; SSE-NEXT:    movdqa (%rdx), %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    paddb 32(%rdx), %xmm0
; SSE-NEXT:    movaps %xmm2, 48(%rcx)
; SSE-NEXT:    movaps %xmm1, 16(%rcx)
; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
; SSE-NEXT:    movdqa %xmm3, (%rcx)
; SSE-NEXT:    retq
;
; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm1
; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT:    vmovaps 16(%rdx), %xmm2
; AVX-NEXT:    vmovaps 48(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rcx)
; AVX-NEXT:    vmovaps %xmm3, 48(%rcx)
; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %broadcast.of.zextinreg = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec = bitcast <4 x i128> %broadcast.of.zextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
; FALLBACK11: {{.*}}
; FALLBACK12: {{.*}}
; FALLBACK13: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; FALLBACK8: {{.*}}
; FALLBACK9: {{.*}}