1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
15; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13
16
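; Each test below follows the same pattern: load two <64 x i8> vectors from
; %in.vec.base.ptr and %in.vec.bias.ptr and add them (presumably so the
; shuffle source is a computed value rather than a plain load), then shuffle
; the sum with a mask that repeatedly selects element 0 for the low
; i8/i16/i32 of every widened element while filling the remaining positions
; from other source elements - i.e. an any-extend-in-register of a broadcast.
; The narrow result is padded back to 64 bytes with undef, a bias vector
; loaded from %out.vec.bias.ptr is added, and the sum is stored to
; %out.vec.ptr.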
17define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
18; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
19; SSE2:       # %bb.0:
20; SSE2-NEXT:    movdqa (%rdi), %xmm0
21; SSE2-NEXT:    paddb (%rsi), %xmm0
22; SSE2-NEXT:    pxor %xmm1, %xmm1
23; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
24; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
25; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
26; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
27; SSE2-NEXT:    packuswb %xmm0, %xmm0
28; SSE2-NEXT:    paddb (%rdx), %xmm0
29; SSE2-NEXT:    movdqa %xmm0, (%rcx)
30; SSE2-NEXT:    retq
31;
32; SSE42-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
33; SSE42:       # %bb.0:
34; SSE42-NEXT:    movdqa (%rdi), %xmm0
35; SSE42-NEXT:    paddb (%rsi), %xmm0
36; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
37; SSE42-NEXT:    paddb (%rdx), %xmm0
38; SSE42-NEXT:    movdqa %xmm0, (%rcx)
39; SSE42-NEXT:    retq
40;
41; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
42; AVX:       # %bb.0:
43; AVX-NEXT:    vmovdqa (%rdi), %xmm0
44; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
45; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
46; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
47; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
48; AVX-NEXT:    retq
49;
50; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
51; AVX2:       # %bb.0:
52; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
53; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
54; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
55; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
56; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
57; AVX2-NEXT:    vzeroupper
58; AVX2-NEXT:    retq
59;
60; AVX512F-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
61; AVX512F:       # %bb.0:
62; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
63; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
64; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
65; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
66; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
67; AVX512F-NEXT:    vzeroupper
68; AVX512F-NEXT:    retq
69;
70; AVX512DQ-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
71; AVX512DQ:       # %bb.0:
72; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
73; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
74; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
75; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
76; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
77; AVX512DQ-NEXT:    vzeroupper
78; AVX512DQ-NEXT:    retq
79;
80; AVX512BW-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
81; AVX512BW:       # %bb.0:
82; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
83; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
84; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
85; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
86; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
87; AVX512BW-NEXT:    vzeroupper
88; AVX512BW-NEXT:    retq
89  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
90  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
91  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
92  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
93  %out.bytevec.padded = shufflevector <4 x i8> %broadcast.of.aextinreg, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
94  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
95  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
96  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
97  ret void
98}
99
100define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
101; SSE2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
102; SSE2:       # %bb.0:
103; SSE2-NEXT:    movdqa (%rdi), %xmm0
104; SSE2-NEXT:    paddb (%rsi), %xmm0
105; SSE2-NEXT:    pxor %xmm1, %xmm1
106; SSE2-NEXT:    movdqa %xmm0, %xmm2
107; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
108; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
109; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
110; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
111; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
112; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
113; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
114; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
115; SSE2-NEXT:    packuswb %xmm2, %xmm2
116; SSE2-NEXT:    paddb (%rdx), %xmm2
117; SSE2-NEXT:    movdqa %xmm2, (%rcx)
118; SSE2-NEXT:    retq
119;
120; SSE42-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
121; SSE42:       # %bb.0:
122; SSE42-NEXT:    movdqa (%rdi), %xmm0
123; SSE42-NEXT:    paddb (%rsi), %xmm0
124; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
125; SSE42-NEXT:    paddb (%rdx), %xmm0
126; SSE42-NEXT:    movdqa %xmm0, (%rcx)
127; SSE42-NEXT:    retq
128;
129; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
130; AVX:       # %bb.0:
131; AVX-NEXT:    vmovdqa (%rdi), %xmm0
132; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
133; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
134; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
135; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
136; AVX-NEXT:    retq
137;
138; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
139; AVX2:       # %bb.0:
140; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
141; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
142; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
143; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
144; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
145; AVX2-NEXT:    vzeroupper
146; AVX2-NEXT:    retq
147;
148; AVX512F-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
149; AVX512F:       # %bb.0:
150; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
151; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
152; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
153; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
154; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
155; AVX512F-NEXT:    vzeroupper
156; AVX512F-NEXT:    retq
157;
158; AVX512DQ-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
159; AVX512DQ:       # %bb.0:
160; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
161; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
162; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
163; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
164; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
165; AVX512DQ-NEXT:    vzeroupper
166; AVX512DQ-NEXT:    retq
167;
168; AVX512BW-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
169; AVX512BW:       # %bb.0:
170; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
171; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
172; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
173; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
174; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
175; AVX512BW-NEXT:    vzeroupper
176; AVX512BW-NEXT:    retq
177  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
178  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
179  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
180  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
181  %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.aextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
182  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
183  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
184  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
185  ret void
186}
187
188define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
189; SSE2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
190; SSE2:       # %bb.0:
191; SSE2-NEXT:    movdqa (%rdi), %xmm0
192; SSE2-NEXT:    paddb (%rsi), %xmm0
193; SSE2-NEXT:    pxor %xmm1, %xmm1
194; SSE2-NEXT:    movdqa %xmm0, %xmm2
195; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
196; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
197; SSE2-NEXT:    pand %xmm3, %xmm2
198; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
199; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
200; SSE2-NEXT:    pandn %xmm0, %xmm3
201; SSE2-NEXT:    por %xmm2, %xmm3
202; SSE2-NEXT:    packuswb %xmm3, %xmm3
203; SSE2-NEXT:    paddb (%rdx), %xmm3
204; SSE2-NEXT:    movdqa %xmm3, (%rcx)
205; SSE2-NEXT:    retq
206;
207; SSE42-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
208; SSE42:       # %bb.0:
209; SSE42-NEXT:    movdqa (%rdi), %xmm0
210; SSE42-NEXT:    paddb (%rsi), %xmm0
211; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
212; SSE42-NEXT:    paddb (%rdx), %xmm0
213; SSE42-NEXT:    movdqa %xmm0, (%rcx)
214; SSE42-NEXT:    retq
215;
216; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
217; AVX:       # %bb.0:
218; AVX-NEXT:    vmovdqa (%rdi), %xmm0
219; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
220; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
221; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
222; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
223; AVX-NEXT:    retq
224;
225; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
226; AVX2:       # %bb.0:
227; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
228; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
229; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
230; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
231; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
232; AVX2-NEXT:    vzeroupper
233; AVX2-NEXT:    retq
234;
235; AVX512F-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
236; AVX512F:       # %bb.0:
237; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
238; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
239; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
240; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
241; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
242; AVX512F-NEXT:    vzeroupper
243; AVX512F-NEXT:    retq
244;
245; AVX512DQ-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
246; AVX512DQ:       # %bb.0:
247; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
248; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
249; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
250; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
251; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
252; AVX512DQ-NEXT:    vzeroupper
253; AVX512DQ-NEXT:    retq
254;
255; AVX512BW-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
256; AVX512BW:       # %bb.0:
257; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
258; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
259; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
260; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
261; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
262; AVX512BW-NEXT:    vzeroupper
263; AVX512BW-NEXT:    retq
264  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
265  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
266  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
267  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
268  %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.aextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
269  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
270  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
271  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
272  ret void
273}
274
275define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
276; SSE2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
277; SSE2:       # %bb.0:
278; SSE2-NEXT:    movdqa (%rdi), %xmm0
279; SSE2-NEXT:    paddb (%rsi), %xmm0
280; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
281; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
282; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
283; SSE2-NEXT:    paddb (%rdx), %xmm0
284; SSE2-NEXT:    movdqa %xmm0, (%rcx)
285; SSE2-NEXT:    retq
286;
287; SSE42-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
288; SSE42:       # %bb.0:
289; SSE42-NEXT:    movdqa (%rdi), %xmm0
290; SSE42-NEXT:    paddb (%rsi), %xmm0
291; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
292; SSE42-NEXT:    paddb (%rdx), %xmm0
293; SSE42-NEXT:    movdqa %xmm0, (%rcx)
294; SSE42-NEXT:    retq
295;
296; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
297; AVX:       # %bb.0:
298; AVX-NEXT:    vmovdqa (%rdi), %xmm0
299; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
300; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
301; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
302; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
303; AVX-NEXT:    retq
304;
305; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
306; AVX2:       # %bb.0:
307; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
308; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
309; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
310; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
311; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
312; AVX2-NEXT:    vzeroupper
313; AVX2-NEXT:    retq
314;
315; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
316; AVX512F:       # %bb.0:
317; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
318; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
319; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
320; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
321; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
322; AVX512F-NEXT:    vzeroupper
323; AVX512F-NEXT:    retq
324;
325; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
326; AVX512DQ:       # %bb.0:
327; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
328; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
329; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
330; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
331; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
332; AVX512DQ-NEXT:    vzeroupper
333; AVX512DQ-NEXT:    retq
334;
335; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
336; AVX512BW:       # %bb.0:
337; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
338; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
339; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
340; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
341; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
342; AVX512BW-NEXT:    vzeroupper
343; AVX512BW-NEXT:    retq
344  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
345  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
346  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
347  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
348  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
349  %out.bytevec = bitcast <4 x i16> %broadcast.of.aextinreg to <8 x i8>
350  %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
351  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
352  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
353  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
354  ret void
355}
356
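; In the vec32/vec64 tests above, the shuffle mask only reads the first
; 16 bytes of the input. Starting with the vec128 tests the non-broadcast
; lanes come from the second 16-byte chunk (element indices 16 and up), so
; codegen has to combine the two halves of the loaded vector (two separate
; 16-byte loads with SSE/AVX1, an extract of the high 128 bits with AVX2
; and AVX-512).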
357define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
358; SSE2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
359; SSE2:       # %bb.0:
360; SSE2-NEXT:    movdqa (%rdi), %xmm0
361; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
362; SSE2-NEXT:    paddb (%rsi), %xmm0
363; SSE2-NEXT:    paddb 16(%rsi), %xmm1
364; SSE2-NEXT:    psrlw $8, %xmm1
365; SSE2-NEXT:    packuswb %xmm1, %xmm1
366; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
367; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
368; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
369; SSE2-NEXT:    paddb (%rdx), %xmm0
370; SSE2-NEXT:    movdqa %xmm0, (%rcx)
371; SSE2-NEXT:    retq
372;
373; SSE42-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
374; SSE42:       # %bb.0:
375; SSE42-NEXT:    movdqa (%rdi), %xmm0
376; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
377; SSE42-NEXT:    paddb (%rsi), %xmm0
378; SSE42-NEXT:    paddb 16(%rsi), %xmm1
379; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
380; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
381; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
382; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
383; SSE42-NEXT:    paddb (%rdx), %xmm0
384; SSE42-NEXT:    movdqa %xmm0, (%rcx)
385; SSE42-NEXT:    retq
386;
387; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
388; AVX:       # %bb.0:
389; AVX-NEXT:    vmovdqa (%rdi), %xmm0
390; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
391; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
392; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
393; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
394; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
395; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
396; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
397; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
398; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
399; AVX-NEXT:    retq
400;
401; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
402; AVX2:       # %bb.0:
403; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
404; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
405; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
406; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
407; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
408; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
409; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
410; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
411; AVX2-NEXT:    vzeroupper
412; AVX2-NEXT:    retq
413;
414; AVX512F-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
415; AVX512F:       # %bb.0:
416; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
417; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
418; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
419; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
420; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
421; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
422; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
423; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
424; AVX512F-NEXT:    vzeroupper
425; AVX512F-NEXT:    retq
426;
427; AVX512DQ-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
428; AVX512DQ:       # %bb.0:
429; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
430; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
431; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
432; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
433; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm0
434; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
435; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
436; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
437; AVX512DQ-NEXT:    vzeroupper
438; AVX512DQ-NEXT:    retq
439;
440; AVX512BW-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
441; AVX512BW:       # %bb.0:
442; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
443; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
444; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
445; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
446; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm0
447; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
448; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
449; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
450; AVX512BW-NEXT:    vzeroupper
451; AVX512BW-NEXT:    retq
452  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
453  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
454  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
455  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
456  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
457  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
458  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
459  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
460  ret void
461}
462
463define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
464; SSE2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
465; SSE2:       # %bb.0:
466; SSE2-NEXT:    movdqa (%rdi), %xmm0
467; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
468; SSE2-NEXT:    paddb (%rsi), %xmm0
469; SSE2-NEXT:    paddb 16(%rsi), %xmm1
470; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
471; SSE2-NEXT:    pand %xmm2, %xmm1
472; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
473; SSE2-NEXT:    pandn %xmm0, %xmm2
474; SSE2-NEXT:    por %xmm1, %xmm2
475; SSE2-NEXT:    paddb (%rdx), %xmm2
476; SSE2-NEXT:    movdqa %xmm2, (%rcx)
477; SSE2-NEXT:    retq
478;
479; SSE42-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
480; SSE42:       # %bb.0:
481; SSE42-NEXT:    movdqa (%rdi), %xmm0
482; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
483; SSE42-NEXT:    paddb 16(%rsi), %xmm1
484; SSE42-NEXT:    paddb (%rsi), %xmm0
485; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
486; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
487; SSE42-NEXT:    paddb (%rdx), %xmm0
488; SSE42-NEXT:    movdqa %xmm0, (%rcx)
489; SSE42-NEXT:    retq
490;
491; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
492; AVX:       # %bb.0:
493; AVX-NEXT:    vmovdqa (%rdi), %xmm0
494; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
495; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
496; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
497; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
498; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
499; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
500; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
501; AVX-NEXT:    retq
502;
503; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
504; AVX2:       # %bb.0:
505; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
506; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
507; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
508; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
509; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
510; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
511; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
512; AVX2-NEXT:    vzeroupper
513; AVX2-NEXT:    retq
514;
515; AVX512F-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
516; AVX512F:       # %bb.0:
517; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
518; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
519; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
520; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
521; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
522; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
523; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
524; AVX512F-NEXT:    vzeroupper
525; AVX512F-NEXT:    retq
526;
527; AVX512DQ-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
528; AVX512DQ:       # %bb.0:
529; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
530; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
531; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
532; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
533; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
534; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
535; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
536; AVX512DQ-NEXT:    vzeroupper
537; AVX512DQ-NEXT:    retq
538;
539; AVX512BW-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
540; AVX512BW:       # %bb.0:
541; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
542; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
543; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
544; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
545; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
546; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
547; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
548; AVX512BW-NEXT:    vzeroupper
549; AVX512BW-NEXT:    retq
550  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
551  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
552  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
553  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
554  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
555  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
556  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
557  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
558  ret void
559}
560
561define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
562; SSE2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
563; SSE2:       # %bb.0:
564; SSE2-NEXT:    movdqa (%rdi), %xmm0
565; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
566; SSE2-NEXT:    paddb (%rsi), %xmm0
567; SSE2-NEXT:    paddb 16(%rsi), %xmm1
568; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
569; SSE2-NEXT:    pand %xmm2, %xmm1
570; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
571; SSE2-NEXT:    pandn %xmm0, %xmm2
572; SSE2-NEXT:    por %xmm1, %xmm2
573; SSE2-NEXT:    paddb (%rdx), %xmm2
574; SSE2-NEXT:    movdqa %xmm2, (%rcx)
575; SSE2-NEXT:    retq
576;
577; SSE42-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
578; SSE42:       # %bb.0:
579; SSE42-NEXT:    movdqa (%rdi), %xmm0
580; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
581; SSE42-NEXT:    paddb 16(%rsi), %xmm1
582; SSE42-NEXT:    paddb (%rsi), %xmm0
583; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
584; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
585; SSE42-NEXT:    paddb (%rdx), %xmm0
586; SSE42-NEXT:    movdqa %xmm0, (%rcx)
587; SSE42-NEXT:    retq
588;
589; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
590; AVX:       # %bb.0:
591; AVX-NEXT:    vmovdqa (%rdi), %xmm0
592; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
593; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
594; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
595; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
596; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
597; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
598; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
599; AVX-NEXT:    retq
600;
601; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
602; AVX2:       # %bb.0:
603; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
604; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
605; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
606; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
607; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
608; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
609; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
610; AVX2-NEXT:    vzeroupper
611; AVX2-NEXT:    retq
612;
613; AVX512F-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
614; AVX512F:       # %bb.0:
615; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
616; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
617; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
618; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
619; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
620; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
621; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
622; AVX512F-NEXT:    vzeroupper
623; AVX512F-NEXT:    retq
624;
625; AVX512DQ-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
626; AVX512DQ:       # %bb.0:
627; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
628; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
629; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
630; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
631; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
632; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
633; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
634; AVX512DQ-NEXT:    vzeroupper
635; AVX512DQ-NEXT:    retq
636;
637; AVX512BW-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
638; AVX512BW:       # %bb.0:
639; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
640; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
641; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
642; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
643; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
644; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
645; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
646; AVX512BW-NEXT:    vzeroupper
647; AVX512BW-NEXT:    retq
648  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
649  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
650  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
651  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
652  %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
653  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
654  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
655  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
656  ret void
657}
658
659define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
660; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
661; SSE2:       # %bb.0:
662; SSE2-NEXT:    movdqa (%rdi), %xmm0
663; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
664; SSE2-NEXT:    paddb 16(%rsi), %xmm1
665; SSE2-NEXT:    paddb (%rsi), %xmm0
666; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
667; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
668; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
669; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
670; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
671; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
672; SSE2-NEXT:    paddb (%rdx), %xmm0
673; SSE2-NEXT:    movdqa %xmm0, (%rcx)
674; SSE2-NEXT:    retq
675;
676; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
677; SSE42:       # %bb.0:
678; SSE42-NEXT:    movdqa (%rdi), %xmm0
679; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
680; SSE42-NEXT:    paddb (%rsi), %xmm0
681; SSE42-NEXT:    paddb 16(%rsi), %xmm1
682; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
683; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
684; SSE42-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
685; SSE42-NEXT:    paddb (%rdx), %xmm0
686; SSE42-NEXT:    movdqa %xmm0, (%rcx)
687; SSE42-NEXT:    retq
688;
689; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
690; AVX:       # %bb.0:
691; AVX-NEXT:    vmovdqa (%rdi), %xmm0
692; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
693; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
694; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
695; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
696; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
697; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
698; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
699; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
700; AVX-NEXT:    retq
701;
702; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
703; AVX2:       # %bb.0:
704; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
705; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
706; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
707; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
708; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
709; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
710; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
711; AVX2-NEXT:    vzeroupper
712; AVX2-NEXT:    retq
713;
714; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
715; AVX512F:       # %bb.0:
716; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
717; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
718; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
719; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
720; AVX512F-NEXT:    vmovd %xmm0, %eax
721; AVX512F-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm0
722; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
723; AVX512F-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
724; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
725; AVX512F-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
726; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
727; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
728; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
729; AVX512F-NEXT:    vzeroupper
730; AVX512F-NEXT:    retq
731;
732; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
733; AVX512DQ:       # %bb.0:
734; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
735; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
736; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
737; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
738; AVX512DQ-NEXT:    vmovd %xmm0, %eax
739; AVX512DQ-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm0
740; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
741; AVX512DQ-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
742; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
743; AVX512DQ-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
744; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
745; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
746; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
747; AVX512DQ-NEXT:    vzeroupper
748; AVX512DQ-NEXT:    retq
749;
750; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
751; AVX512BW-SLOW:       # %bb.0:
752; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
753; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
754; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
755; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
756; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
757; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
758; AVX512BW-SLOW-NEXT:    vzeroupper
759; AVX512BW-SLOW-NEXT:    retq
760;
761; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
762; AVX512BW-FAST:       # %bb.0:
763; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
764; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
765; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
766; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
767; AVX512BW-FAST-NEXT:    vmovd %xmm0, %eax
768; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
769; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
770; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
771; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
772; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
773; AVX512BW-FAST-NEXT:    vzeroupper
774; AVX512BW-FAST-NEXT:    retq
775  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
776  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
777  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
778  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
779  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
780  %out.bytevec = bitcast <8 x i16> %broadcast.of.aextinreg to <16 x i8>
781  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
782  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
783  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
784  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
785  ret void
786}
787
788define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
789; SSE2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
790; SSE2:       # %bb.0:
791; SSE2-NEXT:    movdqa (%rdi), %xmm0
792; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
793; SSE2-NEXT:    paddb (%rsi), %xmm0
794; SSE2-NEXT:    paddb 16(%rsi), %xmm1
795; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
796; SSE2-NEXT:    pand %xmm2, %xmm1
797; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
798; SSE2-NEXT:    pandn %xmm0, %xmm2
799; SSE2-NEXT:    por %xmm1, %xmm2
800; SSE2-NEXT:    paddb (%rdx), %xmm2
801; SSE2-NEXT:    movdqa %xmm2, (%rcx)
802; SSE2-NEXT:    retq
803;
804; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
805; SSE42:       # %bb.0:
806; SSE42-NEXT:    movdqa (%rdi), %xmm0
807; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
808; SSE42-NEXT:    paddb 16(%rsi), %xmm1
809; SSE42-NEXT:    paddb (%rsi), %xmm0
810; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
811; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
812; SSE42-NEXT:    paddb (%rdx), %xmm0
813; SSE42-NEXT:    movdqa %xmm0, (%rcx)
814; SSE42-NEXT:    retq
815;
816; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
817; AVX:       # %bb.0:
818; AVX-NEXT:    vmovdqa (%rdi), %xmm0
819; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
820; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
821; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
822; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
823; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
824; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
825; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
826; AVX-NEXT:    retq
827;
828; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
829; AVX2:       # %bb.0:
830; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
831; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
832; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
833; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
834; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
835; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
836; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
837; AVX2-NEXT:    vzeroupper
838; AVX2-NEXT:    retq
839;
840; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
841; AVX512F:       # %bb.0:
842; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
843; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
844; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
845; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
846; AVX512F-NEXT:    vmovd %xmm0, %eax
847; AVX512F-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm0
848; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
849; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
850; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
851; AVX512F-NEXT:    vzeroupper
852; AVX512F-NEXT:    retq
853;
854; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
855; AVX512DQ:       # %bb.0:
856; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
857; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
858; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
859; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
860; AVX512DQ-NEXT:    vmovd %xmm0, %eax
861; AVX512DQ-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm0
862; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
863; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
864; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
865; AVX512DQ-NEXT:    vzeroupper
866; AVX512DQ-NEXT:    retq
867;
868; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
869; AVX512BW-SLOW:       # %bb.0:
870; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
871; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
872; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
873; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
874; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
875; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
876; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
877; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
878; AVX512BW-SLOW-NEXT:    vzeroupper
879; AVX512BW-SLOW-NEXT:    retq
880;
881; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
882; AVX512BW-FAST:       # %bb.0:
883; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
884; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
885; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
886; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
887; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
888; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
889; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
890; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
891; AVX512BW-FAST-NEXT:    vzeroupper
892; AVX512BW-FAST-NEXT:    retq
893  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
894  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
895  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
896  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
897  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
898  %out.bytevec = bitcast <8 x i16> %broadcast.of.aextinreg to <16 x i8>
899  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
900  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
901  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
902  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
903  ret void
904}
905
906define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
907; SSE2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
908; SSE2:       # %bb.0:
909; SSE2-NEXT:    movdqa (%rdi), %xmm0
910; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
911; SSE2-NEXT:    paddb (%rsi), %xmm0
912; SSE2-NEXT:    paddb 16(%rsi), %xmm1
913; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
914; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
915; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
916; SSE2-NEXT:    paddb (%rdx), %xmm0
917; SSE2-NEXT:    movdqa %xmm0, (%rcx)
918; SSE2-NEXT:    retq
919;
920; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
921; SSE42:       # %bb.0:
922; SSE42-NEXT:    movdqa (%rdi), %xmm0
923; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
924; SSE42-NEXT:    paddb 16(%rsi), %xmm1
925; SSE42-NEXT:    paddb (%rsi), %xmm0
926; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
927; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
928; SSE42-NEXT:    paddb (%rdx), %xmm0
929; SSE42-NEXT:    movdqa %xmm0, (%rcx)
930; SSE42-NEXT:    retq
931;
932; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
933; AVX:       # %bb.0:
934; AVX-NEXT:    vmovdqa (%rdi), %xmm0
935; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
936; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
937; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
938; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
939; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
940; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
941; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
942; AVX-NEXT:    retq
943;
944; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
945; AVX2-SLOW:       # %bb.0:
946; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
947; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
948; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
949; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
950; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
951; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
952; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
953; AVX2-SLOW-NEXT:    vzeroupper
954; AVX2-SLOW-NEXT:    retq
955;
956; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
957; AVX2-FAST-PERLANE:       # %bb.0:
958; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
959; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
960; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
961; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd %xmm0, %xmm0
962; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
963; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
964; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
965; AVX2-FAST-PERLANE-NEXT:    vzeroupper
966; AVX2-FAST-PERLANE-NEXT:    retq
967;
968; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
969; AVX2-FAST:       # %bb.0:
970; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
971; AVX2-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
972; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
973; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
974; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
975; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
976; AVX2-FAST-NEXT:    vzeroupper
977; AVX2-FAST-NEXT:    retq
978;
979; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
980; AVX512F:       # %bb.0:
981; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
982; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
983; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
984; AVX512F-NEXT:    vpermd %ymm1, %ymm0, %ymm0
985; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
986; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
987; AVX512F-NEXT:    vzeroupper
988; AVX512F-NEXT:    retq
989;
990; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
991; AVX512DQ:       # %bb.0:
992; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
993; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
994; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
995; AVX512DQ-NEXT:    vpermd %ymm1, %ymm0, %ymm0
996; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
997; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
998; AVX512DQ-NEXT:    vzeroupper
999; AVX512DQ-NEXT:    retq
1000;
1001; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
1002; AVX512BW:       # %bb.0:
1003; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1004; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
1005; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1006; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
1007; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1008; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1009; AVX512BW-NEXT:    vzeroupper
1010; AVX512BW-NEXT:    retq
1011  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1012  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1013  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1014  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
1015  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
1016  %out.bytevec = bitcast <4 x i32> %broadcast.of.aextinreg to <16 x i8>
1017  %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1018  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1019  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1020  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1021  ret void
1022}
1023
1024define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1025; SSE2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1026; SSE2:       # %bb.0:
1027; SSE2-NEXT:    movdqa (%rdi), %xmm0
1028; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1029; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1030; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1031; SSE2-NEXT:    paddb (%rsi), %xmm0
1032; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1033; SSE2-NEXT:    psrlw $8, %xmm1
1034; SSE2-NEXT:    packuswb %xmm1, %xmm1
1035; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1036; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1037; SSE2-NEXT:    movdqa %xmm0, %xmm3
1038; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1039; SSE2-NEXT:    psrlw $8, %xmm2
1040; SSE2-NEXT:    packuswb %xmm2, %xmm2
1041; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1042; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1043; SSE2-NEXT:    paddb (%rdx), %xmm3
1044; SSE2-NEXT:    movdqa %xmm3, (%rcx)
1045; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1046; SSE2-NEXT:    retq
1047;
1048; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1049; SSE42:       # %bb.0:
1050; SSE42-NEXT:    movdqa (%rdi), %xmm0
1051; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1052; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1053; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1054; SSE42-NEXT:    paddb (%rsi), %xmm0
1055; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1056; SSE42-NEXT:    movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
1057; SSE42-NEXT:    pshufb %xmm3, %xmm1
1058; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1059; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1060; SSE42-NEXT:    movdqa %xmm0, %xmm4
1061; SSE42-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1062; SSE42-NEXT:    pshufb %xmm3, %xmm2
1063; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1064; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1065; SSE42-NEXT:    paddb (%rdx), %xmm4
1066; SSE42-NEXT:    movdqa %xmm4, (%rcx)
1067; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1068; SSE42-NEXT:    retq
1069;
1070; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1071; AVX:       # %bb.0:
1072; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1073; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1074; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1075; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1076; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1077; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1078; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
1079; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1080; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1081; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1082; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1083; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1084; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1085; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1086; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1087; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1088; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1089; AVX-NEXT:    retq
1090;
1091; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1092; AVX2:       # %bb.0:
1093; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1094; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1095; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1096; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1097; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1098; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
1099; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1100; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1101; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1102; AVX2-NEXT:    vzeroupper
1103; AVX2-NEXT:    retq
1104;
1105; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1106; AVX512F:       # %bb.0:
1107; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1108; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1109; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1110; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1111; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1112; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
1113; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1114; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1115; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1116; AVX512F-NEXT:    vzeroupper
1117; AVX512F-NEXT:    retq
1118;
1119; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1120; AVX512DQ:       # %bb.0:
1121; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1122; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1123; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1124; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1125; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1126; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
1127; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1128; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1129; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1130; AVX512DQ-NEXT:    vzeroupper
1131; AVX512DQ-NEXT:    retq
1132;
1133; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1134; AVX512BW:       # %bb.0:
1135; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1136; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1137; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1138; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1139; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
1140; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1141; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1142; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1143; AVX512BW-NEXT:    vzeroupper
1144; AVX512BW-NEXT:    retq
1145  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1146  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1147  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1148  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
1149  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1150  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1151  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1152  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1153  ret void
1154}
1155
1156define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1157; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1158; SSE2:       # %bb.0:
1159; SSE2-NEXT:    movdqa (%rdi), %xmm0
1160; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1161; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1162; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1163; SSE2-NEXT:    paddb (%rsi), %xmm0
1164; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1165; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1166; SSE2-NEXT:    pand %xmm3, %xmm1
1167; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1168; SSE2-NEXT:    pand %xmm3, %xmm2
1169; SSE2-NEXT:    pandn %xmm0, %xmm3
1170; SSE2-NEXT:    por %xmm3, %xmm1
1171; SSE2-NEXT:    por %xmm2, %xmm3
1172; SSE2-NEXT:    paddb 16(%rdx), %xmm3
1173; SSE2-NEXT:    paddb (%rdx), %xmm1
1174; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1175; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
1176; SSE2-NEXT:    retq
1177;
1178; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1179; SSE42:       # %bb.0:
1180; SSE42-NEXT:    movdqa (%rdi), %xmm0
1181; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1182; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1183; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1184; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1185; SSE42-NEXT:    paddb (%rsi), %xmm0
1186; SSE42-NEXT:    movdqa %xmm0, %xmm3
1187; SSE42-NEXT:    palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1188; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1189; SSE42-NEXT:    pshufb %xmm1, %xmm3
1190; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1191; SSE42-NEXT:    pshufb %xmm1, %xmm0
1192; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1193; SSE42-NEXT:    paddb (%rdx), %xmm3
1194; SSE42-NEXT:    movdqa %xmm3, (%rcx)
1195; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1196; SSE42-NEXT:    retq
1197;
1198; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1199; AVX:       # %bb.0:
1200; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1201; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1202; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1203; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1204; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1205; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1206; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1207; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1208; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1209; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1210; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1211; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1212; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1213; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1214; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1215; AVX-NEXT:    retq
1216;
1217; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1218; AVX2:       # %bb.0:
1219; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1220; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1221; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1222; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1223; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
1224; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1225; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1226; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1227; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1228; AVX2-NEXT:    vzeroupper
1229; AVX2-NEXT:    retq
1230;
1231; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1232; AVX512F:       # %bb.0:
1233; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1234; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1235; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1236; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1237; AVX512F-NEXT:    vpbroadcastd %xmm1, %ymm1
1238; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1239; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1240; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1241; AVX512F-NEXT:    vzeroupper
1242; AVX512F-NEXT:    retq
1243;
1244; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1245; AVX512DQ:       # %bb.0:
1246; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1247; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1248; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1249; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1250; AVX512DQ-NEXT:    vpbroadcastd %xmm1, %ymm1
1251; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1252; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1253; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1254; AVX512DQ-NEXT:    vzeroupper
1255; AVX512DQ-NEXT:    retq
1256;
1257; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1258; AVX512BW:       # %bb.0:
1259; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1260; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1261; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1262; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm0
1263; AVX512BW-NEXT:    movl $286331153, %eax # imm = 0x11111111
1264; AVX512BW-NEXT:    kmovd %eax, %k1
1265; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
1266; AVX512BW-NEXT:    vpaddb (%rdx), %zmm1, %zmm0
1267; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1268; AVX512BW-NEXT:    vzeroupper
1269; AVX512BW-NEXT:    retq
1270  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1271  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1272  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1273  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
1274  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1275  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1276  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1277  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1278  ret void
1279}
1280
1281define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1282; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1283; SSE2:       # %bb.0:
1284; SSE2-NEXT:    movdqa (%rdi), %xmm0
1285; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1286; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1287; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1288; SSE2-NEXT:    paddb (%rsi), %xmm0
1289; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1290; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1291; SSE2-NEXT:    pand %xmm3, %xmm1
1292; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1293; SSE2-NEXT:    pand %xmm3, %xmm2
1294; SSE2-NEXT:    pandn %xmm0, %xmm3
1295; SSE2-NEXT:    por %xmm3, %xmm1
1296; SSE2-NEXT:    por %xmm2, %xmm3
1297; SSE2-NEXT:    paddb 16(%rdx), %xmm3
1298; SSE2-NEXT:    paddb (%rdx), %xmm1
1299; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1300; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
1301; SSE2-NEXT:    retq
1302;
1303; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1304; SSE42:       # %bb.0:
1305; SSE42-NEXT:    movdqa (%rdi), %xmm0
1306; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1307; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1308; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1309; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1310; SSE42-NEXT:    paddb (%rsi), %xmm0
1311; SSE42-NEXT:    movdqa %xmm0, %xmm3
1312; SSE42-NEXT:    palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1313; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1314; SSE42-NEXT:    pshufb %xmm1, %xmm3
1315; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1316; SSE42-NEXT:    pshufb %xmm1, %xmm0
1317; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1318; SSE42-NEXT:    paddb (%rdx), %xmm3
1319; SSE42-NEXT:    movdqa %xmm3, (%rcx)
1320; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1321; SSE42-NEXT:    retq
1322;
1323; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1324; AVX:       # %bb.0:
1325; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1326; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1327; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1328; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1329; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1330; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1331; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1332; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1333; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1334; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1335; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1336; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1337; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1338; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1339; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1340; AVX-NEXT:    retq
1341;
1342; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1343; AVX2:       # %bb.0:
1344; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1345; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1346; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1347; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1348; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
1349; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1350; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1351; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1352; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1353; AVX2-NEXT:    vzeroupper
1354; AVX2-NEXT:    retq
1355;
1356; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1357; AVX512F:       # %bb.0:
1358; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1359; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1360; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1361; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1362; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
1363; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1364; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1365; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1366; AVX512F-NEXT:    vzeroupper
1367; AVX512F-NEXT:    retq
1368;
1369; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1370; AVX512DQ:       # %bb.0:
1371; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1372; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1373; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1374; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1375; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
1376; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
1377; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
1378; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1379; AVX512DQ-NEXT:    vzeroupper
1380; AVX512DQ-NEXT:    retq
1381;
1382; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1383; AVX512BW:       # %bb.0:
1384; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1385; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1386; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1387; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm0
1388; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
1389; AVX512BW-NEXT:    kmovd %eax, %k1
1390; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
1391; AVX512BW-NEXT:    vpaddb (%rdx), %zmm1, %zmm0
1392; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1393; AVX512BW-NEXT:    vzeroupper
1394; AVX512BW-NEXT:    retq
1395  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1396  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1397  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1398  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1399  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1400  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1401  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1402  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1403  ret void
1404}
1405
1406define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1407; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1408; SSE2:       # %bb.0:
1409; SSE2-NEXT:    movdqa (%rdi), %xmm0
1410; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1411; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1412; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1413; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1414; SSE2-NEXT:    paddb (%rsi), %xmm0
1415; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1416; SSE2-NEXT:    pand %xmm3, %xmm1
1417; SSE2-NEXT:    pand %xmm3, %xmm2
1418; SSE2-NEXT:    pandn %xmm0, %xmm3
1419; SSE2-NEXT:    por %xmm3, %xmm1
1420; SSE2-NEXT:    por %xmm3, %xmm2
1421; SSE2-NEXT:    paddb 16(%rdx), %xmm2
1422; SSE2-NEXT:    paddb (%rdx), %xmm1
1423; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1424; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
1425; SSE2-NEXT:    retq
1426;
1427; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1428; SSE42:       # %bb.0:
1429; SSE42-NEXT:    movdqa (%rdi), %xmm1
1430; SSE42-NEXT:    movdqa 32(%rdi), %xmm2
1431; SSE42-NEXT:    movdqa 48(%rdi), %xmm3
1432; SSE42-NEXT:    paddb 48(%rsi), %xmm3
1433; SSE42-NEXT:    paddb 32(%rsi), %xmm2
1434; SSE42-NEXT:    paddb (%rsi), %xmm1
1435; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1436; SSE42-NEXT:    movdqa %xmm1, %xmm4
1437; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm4
1438; SSE42-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1439; SSE42-NEXT:    paddb 16(%rdx), %xmm1
1440; SSE42-NEXT:    paddb (%rdx), %xmm4
1441; SSE42-NEXT:    movdqa %xmm4, (%rcx)
1442; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
1443; SSE42-NEXT:    retq
1444;
1445; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1446; AVX:       # %bb.0:
1447; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1448; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1449; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1450; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1451; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1452; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1453; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615]
1454; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm1
1455; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
1456; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1457; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1458; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1459; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1460; AVX-NEXT:    retq
1461;
1462; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1463; AVX2:       # %bb.0:
1464; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1465; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1466; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1467; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1468; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1469; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
1470; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
1471; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1472; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1473; AVX2-NEXT:    vzeroupper
1474; AVX2-NEXT:    retq
1475;
1476; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1477; AVX512F:       # %bb.0:
1478; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1479; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
1480; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1481; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1482; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1483; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1484; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
1485; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1486; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
1487; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1488; AVX512F-NEXT:    vzeroupper
1489; AVX512F-NEXT:    retq
1490;
1491; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1492; AVX512DQ:       # %bb.0:
1493; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1494; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
1495; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1496; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1497; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1498; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1499; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
1500; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1501; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
1502; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1503; AVX512DQ-NEXT:    vzeroupper
1504; AVX512DQ-NEXT:    retq
1505;
1506; AVX512BW-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1507; AVX512BW:       # %bb.0:
1508; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1509; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1510; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1511; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1512; AVX512BW-NEXT:    movl $65537, %eax # imm = 0x10001
1513; AVX512BW-NEXT:    kmovd %eax, %k1
1514; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
1515; AVX512BW-NEXT:    vpaddb (%rdx), %zmm1, %zmm0
1516; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1517; AVX512BW-NEXT:    vzeroupper
1518; AVX512BW-NEXT:    retq
1519  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1520  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1521  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1522  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1523  %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1524  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1525  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1526  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1527  ret void
1528}
1529
1530define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1531; SSE2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1532; SSE2:       # %bb.0:
1533; SSE2-NEXT:    movdqa (%rdi), %xmm0
1534; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1535; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1536; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1537; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1538; SSE2-NEXT:    paddb (%rsi), %xmm0
1539; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1540; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1541; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1542; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1543; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1544; SSE2-NEXT:    movdqa %xmm0, %xmm3
1545; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1546; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7]
1547; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1548; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1549; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1550; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1551; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1552; SSE2-NEXT:    paddb (%rdx), %xmm3
1553; SSE2-NEXT:    movdqa %xmm3, (%rcx)
1554; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1555; SSE2-NEXT:    retq
1556;
1557; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1558; SSE42:       # %bb.0:
1559; SSE42-NEXT:    movdqa (%rdi), %xmm0
1560; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1561; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1562; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1563; SSE42-NEXT:    paddb (%rsi), %xmm0
1564; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1565; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1566; SSE42-NEXT:    pshufb %xmm3, %xmm1
1567; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1568; SSE42-NEXT:    movdqa %xmm0, %xmm4
1569; SSE42-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1570; SSE42-NEXT:    pshufb %xmm3, %xmm2
1571; SSE42-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1572; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1573; SSE42-NEXT:    paddb (%rdx), %xmm4
1574; SSE42-NEXT:    movdqa %xmm4, (%rcx)
1575; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1576; SSE42-NEXT:    retq
1577;
1578; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1579; AVX:       # %bb.0:
1580; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1581; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1582; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1583; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1584; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1585; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1586; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1587; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1588; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1589; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1590; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1591; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1592; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1593; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1594; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1595; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1596; AVX-NEXT:    retq
1597;
1598; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1599; AVX2:       # %bb.0:
1600; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1601; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1602; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1603; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1604; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
1605; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1606; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1607; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1608; AVX2-NEXT:    vzeroupper
1609; AVX2-NEXT:    retq
1610;
1611; AVX512F-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1612; AVX512F:       # %bb.0:
1613; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1614; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1615; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1616; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1617; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm1
1618; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1619; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1620; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1621; AVX512F-NEXT:    vzeroupper
1622; AVX512F-NEXT:    retq
1623;
1624; AVX512DQ-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1625; AVX512DQ:       # %bb.0:
1626; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1627; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1628; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1629; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1630; AVX512DQ-NEXT:    vpbroadcastw %xmm1, %ymm1
1631; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1632; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1633; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1634; AVX512DQ-NEXT:    vzeroupper
1635; AVX512DQ-NEXT:    retq
1636;
1637; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1638; AVX512BW:       # %bb.0:
1639; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1640; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1641; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
1642; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
1643; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
1644; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1645; AVX512BW-NEXT:    vzeroupper
1646; AVX512BW-NEXT:    retq
1647  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1648  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1649  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1650  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1651  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
1652  %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8>
1653  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1654  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1655  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1656  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1657  ret void
1658}
1659
1660define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1661; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1662; SSE2:       # %bb.0:
1663; SSE2-NEXT:    movdqa (%rdi), %xmm0
1664; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1665; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1666; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1667; SSE2-NEXT:    paddb (%rsi), %xmm0
1668; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1669; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
1670; SSE2-NEXT:    pand %xmm3, %xmm1
1671; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1672; SSE2-NEXT:    pand %xmm3, %xmm2
1673; SSE2-NEXT:    pandn %xmm0, %xmm3
1674; SSE2-NEXT:    por %xmm3, %xmm1
1675; SSE2-NEXT:    por %xmm2, %xmm3
1676; SSE2-NEXT:    paddb 16(%rdx), %xmm3
1677; SSE2-NEXT:    paddb (%rdx), %xmm1
1678; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1679; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
1680; SSE2-NEXT:    retq
1681;
1682; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1683; SSE42:       # %bb.0:
1684; SSE42-NEXT:    movdqa (%rdi), %xmm0
1685; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1686; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1687; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1688; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1689; SSE42-NEXT:    paddb (%rsi), %xmm0
1690; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1691; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1692; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1693; SSE42-NEXT:    paddb 16(%rdx), %xmm2
1694; SSE42-NEXT:    paddb (%rdx), %xmm1
1695; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1696; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
1697; SSE42-NEXT:    retq
1698;
1699; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1700; AVX:       # %bb.0:
1701; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1702; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1703; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1704; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1705; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1706; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1707; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1708; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1709; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1710; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1711; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1712; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1713; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1714; AVX-NEXT:    retq
1715;
1716; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1717; AVX2:       # %bb.0:
1718; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1719; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1720; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1721; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1722; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
1723; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1724; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1725; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1726; AVX2-NEXT:    vzeroupper
1727; AVX2-NEXT:    retq
1728;
1729; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1730; AVX512F:       # %bb.0:
1731; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
1732; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1733; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
1734; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1735; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
1736; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1737; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1738; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1739; AVX512F-NEXT:    vzeroupper
1740; AVX512F-NEXT:    retq
1741;
1742; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1743; AVX512DQ:       # %bb.0:
1744; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
1745; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1746; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1747; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1748; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
1749; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1750; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1751; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1752; AVX512DQ-NEXT:    vzeroupper
1753; AVX512DQ-NEXT:    retq
1754;
1755; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1756; AVX512BW:       # %bb.0:
1757; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1758; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1759; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1760; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
1761; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1762; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
1763; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1764; AVX512BW-NEXT:    vzeroupper
1765; AVX512BW-NEXT:    retq
1766  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1767  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1768  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1769  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1770  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
1771  %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8>
1772  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1773  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1774  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1775  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1776  ret void
1777}
1778
1779define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1780; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1781; SSE2:       # %bb.0:
1782; SSE2-NEXT:    movdqa (%rdi), %xmm0
1783; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1784; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1785; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1786; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1787; SSE2-NEXT:    paddb (%rsi), %xmm0
1788; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
1789; SSE2-NEXT:    pand %xmm3, %xmm1
1790; SSE2-NEXT:    pand %xmm3, %xmm2
1791; SSE2-NEXT:    pandn %xmm0, %xmm3
1792; SSE2-NEXT:    por %xmm3, %xmm1
1793; SSE2-NEXT:    por %xmm3, %xmm2
1794; SSE2-NEXT:    paddb 16(%rdx), %xmm2
1795; SSE2-NEXT:    paddb (%rdx), %xmm1
1796; SSE2-NEXT:    movdqa %xmm1, (%rcx)
1797; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
1798; SSE2-NEXT:    retq
1799;
1800; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1801; SSE42:       # %bb.0:
1802; SSE42-NEXT:    movdqa (%rdi), %xmm0
1803; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1804; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1805; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1806; SSE42-NEXT:    paddb (%rsi), %xmm0
1807; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1808; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1809; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1810; SSE42-NEXT:    paddb 16(%rdx), %xmm0
1811; SSE42-NEXT:    paddb (%rdx), %xmm1
1812; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1813; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
1814; SSE42-NEXT:    retq
1815;
1816; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1817; AVX:       # %bb.0:
1818; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1819; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1820; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1821; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1822; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1823; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1824; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1825; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1826; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
1827; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
1828; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
1829; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
1830; AVX-NEXT:    retq
1831;
1832; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1833; AVX2:       # %bb.0:
1834; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1835; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1836; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1837; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1838; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1839; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1840; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1841; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1842; AVX2-NEXT:    vzeroupper
1843; AVX2-NEXT:    retq
1844;
1845; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1846; AVX512F:       # %bb.0:
1847; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
1848; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
1849; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1850; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1851; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1852; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1853; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1854; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
1855; AVX512F-NEXT:    vzeroupper
1856; AVX512F-NEXT:    retq
1857;
1858; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1859; AVX512DQ:       # %bb.0:
1860; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1861; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
1862; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
1863; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
1864; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1865; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1866; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1867; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
1868; AVX512DQ-NEXT:    vzeroupper
1869; AVX512DQ-NEXT:    retq
1870;
1871; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1872; AVX512BW:       # %bb.0:
1873; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
1874; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
1875; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1876; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
1877; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
1878; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
1879; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
1880; AVX512BW-NEXT:    vzeroupper
1881; AVX512BW-NEXT:    retq
1882  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1883  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1884  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1885  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1886  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1887  %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8>
1888  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1889  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1890  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1891  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1892  ret void
1893}
1894
1895define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1896; SSE2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1897; SSE2:       # %bb.0:
1898; SSE2-NEXT:    movdqa (%rdi), %xmm0
1899; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
1900; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
1901; SSE2-NEXT:    paddb 48(%rsi), %xmm2
1902; SSE2-NEXT:    paddb (%rsi), %xmm0
1903; SSE2-NEXT:    paddb 32(%rsi), %xmm1
1904; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1905; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1906; SSE2-NEXT:    movdqa %xmm0, %xmm3
1907; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1908; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
1909; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1910; SSE2-NEXT:    paddb 16(%rdx), %xmm0
1911; SSE2-NEXT:    paddb (%rdx), %xmm3
1912; SSE2-NEXT:    movdqa %xmm3, (%rcx)
1913; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
1914; SSE2-NEXT:    retq
1915;
1916; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1917; SSE42:       # %bb.0:
1918; SSE42-NEXT:    movdqa (%rdi), %xmm0
1919; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
1920; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
1921; SSE42-NEXT:    paddb 48(%rsi), %xmm2
1922; SSE42-NEXT:    paddb 32(%rsi), %xmm1
1923; SSE42-NEXT:    paddb (%rsi), %xmm0
1924; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1925; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1926; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1927; SSE42-NEXT:    paddb 16(%rdx), %xmm2
1928; SSE42-NEXT:    paddb (%rdx), %xmm1
1929; SSE42-NEXT:    movdqa %xmm1, (%rcx)
1930; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
1931; SSE42-NEXT:    retq
1932;
1933; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1934; AVX:       # %bb.0:
1935; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1936; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1937; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
1938; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
1939; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
1940; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1941; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
1942; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1943; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
1944; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1945; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1946; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
1947; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
1948; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
1949; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
1950; AVX-NEXT:    vzeroupper
1951; AVX-NEXT:    retq
1952;
1953; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1954; AVX2:       # %bb.0:
1955; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
1956; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1957; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1958; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1959; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
1960; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1961; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1962; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
1963; AVX2-NEXT:    vzeroupper
1964; AVX2-NEXT:    retq
1965;
1966; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1967; AVX512F-SLOW:       # %bb.0:
1968; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
1969; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1970; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
1971; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1972; AVX512F-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
1973; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1974; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
1975; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
1976; AVX512F-SLOW-NEXT:    vzeroupper
1977; AVX512F-SLOW-NEXT:    retq
1978;
1979; AVX512F-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1980; AVX512F-FAST:       # %bb.0:
1981; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm0
1982; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1983; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm1
1984; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1985; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
1986; AVX512F-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
1987; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
1988; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
1989; AVX512F-FAST-NEXT:    vzeroupper
1990; AVX512F-FAST-NEXT:    retq
1991;
1992; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1993; AVX512DQ-SLOW:       # %bb.0:
1994; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
1995; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
1996; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
1997; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
1998; AVX512DQ-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
1999; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
2000; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2001; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
2002; AVX512DQ-SLOW-NEXT:    vzeroupper
2003; AVX512DQ-SLOW-NEXT:    retq
2004;
2005; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2006; AVX512DQ-FAST:       # %bb.0:
2007; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm0
2008; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
2009; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm1
2010; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
2011; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
2012; AVX512DQ-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
2013; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
2014; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
2015; AVX512DQ-FAST-NEXT:    vzeroupper
2016; AVX512DQ-FAST-NEXT:    retq
2017;
2018; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2019; AVX512BW-SLOW:       # %bb.0:
2020; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
2021; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2022; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2023; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
2024; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
2025; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2026; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2027; AVX512BW-SLOW-NEXT:    vzeroupper
2028; AVX512BW-SLOW-NEXT:    retq
2029;
2030; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2031; AVX512BW-FAST:       # %bb.0:
2032; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
2033; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
2034; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2035; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
2036; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2037; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
2038; AVX512BW-FAST-NEXT:    vzeroupper
2039; AVX512BW-FAST-NEXT:    retq
2040  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2041  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2042  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2043  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
2044  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
2045  %out.bytevec = bitcast <8 x i32> %broadcast.of.aextinreg to <32 x i8>
2046  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2047  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2048  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2049  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2050  ret void
2051}
2052
2053define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2054; SSE2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2055; SSE2:       # %bb.0:
2056; SSE2-NEXT:    movdqa (%rdi), %xmm0
2057; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
2058; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
2059; SSE2-NEXT:    paddb 48(%rsi), %xmm2
2060; SSE2-NEXT:    paddb (%rsi), %xmm0
2061; SSE2-NEXT:    paddb 32(%rsi), %xmm1
2062; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2063; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2064; SSE2-NEXT:    paddb 16(%rdx), %xmm2
2065; SSE2-NEXT:    paddb (%rdx), %xmm1
2066; SSE2-NEXT:    movdqa %xmm1, (%rcx)
2067; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
2068; SSE2-NEXT:    retq
2069;
2070; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2071; SSE42:       # %bb.0:
2072; SSE42-NEXT:    movdqa (%rdi), %xmm0
2073; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
2074; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
2075; SSE42-NEXT:    paddb 48(%rsi), %xmm2
2076; SSE42-NEXT:    paddb (%rsi), %xmm0
2077; SSE42-NEXT:    paddb 32(%rsi), %xmm1
2078; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2079; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
2080; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2081; SSE42-NEXT:    paddb (%rdx), %xmm1
2082; SSE42-NEXT:    movdqa %xmm1, (%rcx)
2083; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2084; SSE42-NEXT:    retq
2085;
2086; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2087; AVX:       # %bb.0:
2088; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2089; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
2090; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
2091; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
2092; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
2093; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2094; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2095; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2096; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2097; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
2098; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
2099; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2100; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2101; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
2102; AVX-NEXT:    vzeroupper
2103; AVX-NEXT:    retq
2104;
2105; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2106; AVX2:       # %bb.0:
2107; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2108; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
2109; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2110; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2111; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2112; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2113; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2114; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2115; AVX2-NEXT:    vzeroupper
2116; AVX2-NEXT:    retq
2117;
2118; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2119; AVX512F-SLOW:       # %bb.0:
2120; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
2121; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm1
2122; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2123; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2124; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2125; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2126; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2127; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
2128; AVX512F-SLOW-NEXT:    vzeroupper
2129; AVX512F-SLOW-NEXT:    retq
2130;
2131; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2132; AVX512F-FAST:       # %bb.0:
2133; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
2134; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
2135; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2136; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2137; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
2138; AVX512F-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
2139; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
2140; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
2141; AVX512F-FAST-NEXT:    vzeroupper
2142; AVX512F-FAST-NEXT:    retq
2143;
2144; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2145; AVX512DQ-SLOW:       # %bb.0:
2146; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
2147; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm1
2148; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2149; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2150; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2151; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2152; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2153; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
2154; AVX512DQ-SLOW-NEXT:    vzeroupper
2155; AVX512DQ-SLOW-NEXT:    retq
2156;
2157; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2158; AVX512DQ-FAST:       # %bb.0:
2159; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm0
2160; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
2161; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2162; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2163; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
2164; AVX512DQ-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
2165; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
2166; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
2167; AVX512DQ-FAST-NEXT:    vzeroupper
2168; AVX512DQ-FAST-NEXT:    retq
2169;
2170; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2171; AVX512BW-SLOW:       # %bb.0:
2172; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
2173; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2174; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2175; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2176; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2177; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2178; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2179; AVX512BW-SLOW-NEXT:    vzeroupper
2180; AVX512BW-SLOW-NEXT:    retq
2181;
2182; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2183; AVX512BW-FAST:       # %bb.0:
2184; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
2185; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
2186; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2187; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
2188; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2189; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
2190; AVX512BW-FAST-NEXT:    vzeroupper
2191; AVX512BW-FAST-NEXT:    retq
2192  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2193  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2194  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2195  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
2196  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
2197  %out.bytevec = bitcast <8 x i32> %broadcast.of.aextinreg to <32 x i8>
2198  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2199  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2200  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2201  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2202  ret void
2203}
2204
2205define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2206; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2207; SSE2:       # %bb.0:
2208; SSE2-NEXT:    movdqa (%rdi), %xmm0
2209; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
2210; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
2211; SSE2-NEXT:    paddb 48(%rsi), %xmm2
2212; SSE2-NEXT:    paddb (%rsi), %xmm0
2213; SSE2-NEXT:    paddb 32(%rsi), %xmm1
2214; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2215; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
2216; SSE2-NEXT:    paddb 16(%rdx), %xmm0
2217; SSE2-NEXT:    paddb (%rdx), %xmm1
2218; SSE2-NEXT:    movdqa %xmm1, (%rcx)
2219; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
2220; SSE2-NEXT:    retq
2221;
2222; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2223; SSE42:       # %bb.0:
2224; SSE42-NEXT:    movdqa (%rdi), %xmm0
2225; SSE42-NEXT:    movdqa 32(%rdi), %xmm1
2226; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
2227; SSE42-NEXT:    paddb 48(%rsi), %xmm2
2228; SSE42-NEXT:    paddb 32(%rsi), %xmm1
2229; SSE42-NEXT:    paddb (%rsi), %xmm0
2230; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2231; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2232; SSE42-NEXT:    paddb 16(%rdx), %xmm0
2233; SSE42-NEXT:    paddb (%rdx), %xmm1
2234; SSE42-NEXT:    movdqa %xmm1, (%rcx)
2235; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
2236; SSE42-NEXT:    retq
2237;
2238; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2239; AVX:       # %bb.0:
2240; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2241; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
2242; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
2243; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
2244; AVX-NEXT:    vpaddb 32(%rsi), %xmm1, %xmm1
2245; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2246; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2247; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2248; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2249; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
2250; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
2251; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
2252; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
2253; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
2254; AVX-NEXT:    vzeroupper
2255; AVX-NEXT:    retq
2256;
2257; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2258; AVX2:       # %bb.0:
2259; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
2260; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
2261; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
2262; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
2263; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
2264; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2265; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2266; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
2267; AVX2-NEXT:    vzeroupper
2268; AVX2-NEXT:    retq
2269;
2270; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2271; AVX512F-SLOW:       # %bb.0:
2272; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
2273; AVX512F-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
2274; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
2275; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
2276; AVX512F-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
2277; AVX512F-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2278; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2279; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
2280; AVX512F-SLOW-NEXT:    vzeroupper
2281; AVX512F-SLOW-NEXT:    retq
2282;
2283; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2284; AVX512F-FAST:       # %bb.0:
2285; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm0
2286; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
2287; AVX512F-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2288; AVX512F-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2289; AVX512F-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7]
2290; AVX512F-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
2291; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
2292; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
2293; AVX512F-FAST-NEXT:    vzeroupper
2294; AVX512F-FAST-NEXT:    retq
2295;
2296; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2297; AVX512DQ-SLOW:       # %bb.0:
2298; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm0
2299; AVX512DQ-SLOW-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
2300; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
2301; AVX512DQ-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
2302; AVX512DQ-SLOW-NEXT:    vpbroadcastq %xmm1, %ymm1
2303; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2304; AVX512DQ-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
2305; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
2306; AVX512DQ-SLOW-NEXT:    vzeroupper
2307; AVX512DQ-SLOW-NEXT:    retq
2308;
2309; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2310; AVX512DQ-FAST:       # %bb.0:
2311; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm0
2312; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
2313; AVX512DQ-FAST-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
2314; AVX512DQ-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
2315; AVX512DQ-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7]
2316; AVX512DQ-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
2317; AVX512DQ-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
2318; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
2319; AVX512DQ-FAST-NEXT:    vzeroupper
2320; AVX512DQ-FAST-NEXT:    retq
2321;
2322; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2323; AVX512BW-SLOW:       # %bb.0:
2324; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
2325; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2326; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2327; AVX512BW-SLOW-NEXT:    vpbroadcastq %xmm0, %ymm0
2328; AVX512BW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2329; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2330; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2331; AVX512BW-SLOW-NEXT:    vzeroupper
2332; AVX512BW-SLOW-NEXT:    retq
2333;
2334; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2335; AVX512BW-FAST:       # %bb.0:
2336; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
2337; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
2338; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2339; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
2340; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2341; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
2342; AVX512BW-FAST-NEXT:    vzeroupper
2343; AVX512BW-FAST-NEXT:    retq
2344  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2345  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2346  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2347  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
2348  %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
2349  %out.bytevec = bitcast <4 x i64> %broadcast.of.aextinreg to <32 x i8>
2350  %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2351  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2352  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2353  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2354  ret void
2355}
2356
2357define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2358; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2359; SSE2:       # %bb.0:
2360; SSE2-NEXT:    movdqa (%rdi), %xmm0
2361; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
2362; SSE2-NEXT:    paddb (%rsi), %xmm0
2363; SSE2-NEXT:    paddb 48(%rsi), %xmm1
2364; SSE2-NEXT:    psrlw $8, %xmm1
2365; SSE2-NEXT:    packuswb %xmm1, %xmm1
2366; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2367; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2368; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2369; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2370; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
2371; SSE2-NEXT:    paddb (%rdx), %xmm0
2372; SSE2-NEXT:    movdqa 16(%rdx), %xmm2
2373; SSE2-NEXT:    paddb %xmm1, %xmm2
2374; SSE2-NEXT:    paddb 32(%rdx), %xmm1
2375; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
2376; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
2377; SSE2-NEXT:    movdqa %xmm0, (%rcx)
2378; SSE2-NEXT:    retq
2379;
2380; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2381; SSE42:       # %bb.0:
2382; SSE42-NEXT:    movdqa (%rdi), %xmm0
2383; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
2384; SSE42-NEXT:    paddb (%rsi), %xmm0
2385; SSE42-NEXT:    paddb 48(%rsi), %xmm1
2386; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2387; SSE42-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2388; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2389; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2390; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2391; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
2392; SSE42-NEXT:    paddb (%rdx), %xmm0
2393; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
2394; SSE42-NEXT:    paddb %xmm1, %xmm2
2395; SSE42-NEXT:    paddb 32(%rdx), %xmm1
2396; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
2397; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
2398; SSE42-NEXT:    movdqa %xmm0, (%rcx)
2399; SSE42-NEXT:    retq
2400;
2401; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2402; AVX:       # %bb.0:
2403; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2404; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
2405; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2406; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2407; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2408; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2409; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
2410; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2411; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2412; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2413; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2414; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
2415; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2416; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2417; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
2418; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2419; AVX-NEXT:    retq
2420;
2421; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2422; AVX2:       # %bb.0:
2423; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2424; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
2425; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2426; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2427; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2428; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
2429; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2430; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2431; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2432; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2433; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2434; AVX2-NEXT:    vzeroupper
2435; AVX2-NEXT:    retq
2436;
2437; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2438; AVX512F:       # %bb.0:
2439; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2440; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
2441; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2442; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2443; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2444; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
2445; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2446; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2447; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2448; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2449; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2450; AVX512F-NEXT:    vzeroupper
2451; AVX512F-NEXT:    retq
2452;
2453; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2454; AVX512DQ:       # %bb.0:
2455; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
2456; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
2457; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2458; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2459; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2460; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
2461; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2462; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2463; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2464; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
2465; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
2466; AVX512DQ-NEXT:    vzeroupper
2467; AVX512DQ-NEXT:    retq
2468;
2469; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2470; AVX512BW:       # %bb.0:
2471; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2472; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2473; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
2474; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2475; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
2476; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2477; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2478; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2479; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2480; AVX512BW-NEXT:    vzeroupper
2481; AVX512BW-NEXT:    retq
2482  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2483  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2484  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2485  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63, i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95>
2486  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2487  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2488  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2489  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2490  ret void
2491}
2492
2493define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2494; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2495; SSE2:       # %bb.0:
2496; SSE2-NEXT:    movdqa (%rdi), %xmm0
2497; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
2498; SSE2-NEXT:    paddb (%rsi), %xmm0
2499; SSE2-NEXT:    paddb 48(%rsi), %xmm1
2500; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
2501; SSE2-NEXT:    pand %xmm2, %xmm1
2502; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2503; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2504; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2505; SSE2-NEXT:    pandn %xmm0, %xmm2
2506; SSE2-NEXT:    por %xmm1, %xmm2
2507; SSE2-NEXT:    paddb (%rdx), %xmm2
2508; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
2509; SSE2-NEXT:    paddb %xmm0, %xmm1
2510; SSE2-NEXT:    paddb 32(%rdx), %xmm0
2511; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
2512; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
2513; SSE2-NEXT:    movdqa %xmm2, (%rcx)
2514; SSE2-NEXT:    retq
2515;
2516; SSE42-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2517; SSE42:       # %bb.0:
2518; SSE42-NEXT:    movdqa (%rdi), %xmm0
2519; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
2520; SSE42-NEXT:    paddb 48(%rsi), %xmm1
2521; SSE42-NEXT:    paddb (%rsi), %xmm0
2522; SSE42-NEXT:    movdqa %xmm0, %xmm2
2523; SSE42-NEXT:    palignr {{.*#+}} xmm2 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
2524; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2525; SSE42-NEXT:    pxor %xmm1, %xmm1
2526; SSE42-NEXT:    pshufb %xmm1, %xmm0
2527; SSE42-NEXT:    paddb (%rdx), %xmm2
2528; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
2529; SSE42-NEXT:    paddb %xmm0, %xmm1
2530; SSE42-NEXT:    paddb 32(%rdx), %xmm0
2531; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
2532; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
2533; SSE42-NEXT:    movdqa %xmm2, (%rcx)
2534; SSE42-NEXT:    retq
2535;
2536; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2537; AVX:       # %bb.0:
2538; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2539; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
2540; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2541; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2542; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2543; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2544; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2545; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2546; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2547; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
2548; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2549; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2550; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
2551; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2552; AVX-NEXT:    retq
2553;
2554; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2555; AVX2:       # %bb.0:
2556; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2557; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
2558; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2559; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2560; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2561; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2562; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2563; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2564; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2565; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2566; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2567; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2568; AVX2-NEXT:    vzeroupper
2569; AVX2-NEXT:    retq
2570;
2571; AVX512F-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2572; AVX512F:       # %bb.0:
2573; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2574; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
2575; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2576; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2577; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2578; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2579; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
2580; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2581; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2582; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2583; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2584; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2585; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2586; AVX512F-NEXT:    vzeroupper
2587; AVX512F-NEXT:    retq
2588;
2589; AVX512DQ-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2590; AVX512DQ:       # %bb.0:
2591; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
2592; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
2593; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2594; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2595; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2596; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2597; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
2598; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2599; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2600; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2601; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2602; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
2603; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
2604; AVX512DQ-NEXT:    vzeroupper
2605; AVX512DQ-NEXT:    retq
2606;
2607; AVX512BW-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2608; AVX512BW:       # %bb.0:
2609; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2610; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2611; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
2612; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2613; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2614; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm0
2615; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2616; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2617; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2618; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2619; AVX512BW-NEXT:    vzeroupper
2620; AVX512BW-NEXT:    retq
2621  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2622  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2623  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2624  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 0, i32 52, i32 53, i32 0, i32 55, i32 56, i32 0, i32 58, i32 59, i32 0, i32 61, i32 62, i32 0, i32 64, i32 65, i32 0, i32 67, i32 68, i32 0, i32 70, i32 71, i32 0, i32 73, i32 74, i32 0, i32 76, i32 77, i32 0, i32 79, i32 80, i32 0, i32 82, i32 83, i32 0, i32 85, i32 86, i32 0, i32 88, i32 89, i32 0, i32 91, i32 92, i32 0, i32 94, i32 95>
2625  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2626  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2627  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2628  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2629  ret void
2630}
2631
2632define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2633; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2634; SSE2:       # %bb.0:
2635; SSE2-NEXT:    movdqa (%rdi), %xmm0
2636; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
2637; SSE2-NEXT:    paddb (%rsi), %xmm0
2638; SSE2-NEXT:    paddb 48(%rsi), %xmm1
2639; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2640; SSE2-NEXT:    pand %xmm2, %xmm1
2641; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2642; SSE2-NEXT:    pandn %xmm0, %xmm2
2643; SSE2-NEXT:    por %xmm1, %xmm2
2644; SSE2-NEXT:    paddb (%rdx), %xmm2
2645; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
2646; SSE2-NEXT:    paddb %xmm0, %xmm1
2647; SSE2-NEXT:    paddb 32(%rdx), %xmm0
2648; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
2649; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
2650; SSE2-NEXT:    movdqa %xmm2, (%rcx)
2651; SSE2-NEXT:    retq
2652;
2653; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2654; SSE42:       # %bb.0:
2655; SSE42-NEXT:    movdqa (%rdi), %xmm0
2656; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
2657; SSE42-NEXT:    paddb 48(%rsi), %xmm1
2658; SSE42-NEXT:    paddb (%rsi), %xmm0
2659; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
2660; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2661; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
2662; SSE42-NEXT:    paddb (%rdx), %xmm0
2663; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
2664; SSE42-NEXT:    paddb %xmm2, %xmm1
2665; SSE42-NEXT:    paddb 32(%rdx), %xmm2
2666; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
2667; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
2668; SSE42-NEXT:    movdqa %xmm0, (%rcx)
2669; SSE42-NEXT:    retq
2670;
2671; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2672; AVX:       # %bb.0:
2673; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2674; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
2675; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2676; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2677; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2678; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
2679; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2680; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2681; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
2682; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2683; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2684; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
2685; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2686; AVX-NEXT:    retq
2687;
2688; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2689; AVX2:       # %bb.0:
2690; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2691; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
2692; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2693; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2694; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm2
2695; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2696; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2697; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2698; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2699; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2700; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2701; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2702; AVX2-NEXT:    vzeroupper
2703; AVX2-NEXT:    retq
2704;
2705; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2706; AVX512F:       # %bb.0:
2707; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2708; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
2709; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2710; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2711; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm2
2712; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2713; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
2714; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
2715; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2716; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2717; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2718; AVX512F-NEXT:    vzeroupper
2719; AVX512F-NEXT:    retq
2720;
2721; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2722; AVX512DQ:       # %bb.0:
2723; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
2724; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
2725; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2726; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2727; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm2
2728; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2729; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
2730; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
2731; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2732; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
2733; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
2734; AVX512DQ-NEXT:    vzeroupper
2735; AVX512DQ-NEXT:    retq
2736;
2737; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2738; AVX512BW:       # %bb.0:
2739; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2740; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2741; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
2742; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm2
2743; AVX512BW-NEXT:    movl $286331153, %eax # imm = 0x11111111
2744; AVX512BW-NEXT:    kmovd %eax, %k1
2745; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
2746; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
2747; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2748; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2749; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2750; AVX512BW-NEXT:    vzeroupper
2751; AVX512BW-NEXT:    retq
2752  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2753  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2754  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2755  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95>
2756  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2757  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2758  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2759  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2760  ret void
2761}
2762
2763define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2764; SSE2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2765; SSE2:       # %bb.0:
2766; SSE2-NEXT:    movdqa (%rdi), %xmm0
2767; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
2768; SSE2-NEXT:    paddb (%rsi), %xmm0
2769; SSE2-NEXT:    paddb 48(%rsi), %xmm1
2770; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
2771; SSE2-NEXT:    pand %xmm2, %xmm1
2772; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2773; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2774; SSE2-NEXT:    pandn %xmm0, %xmm2
2775; SSE2-NEXT:    por %xmm1, %xmm2
2776; SSE2-NEXT:    paddb (%rdx), %xmm2
2777; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
2778; SSE2-NEXT:    paddb %xmm0, %xmm1
2779; SSE2-NEXT:    paddb 32(%rdx), %xmm0
2780; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
2781; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
2782; SSE2-NEXT:    movdqa %xmm2, (%rcx)
2783; SSE2-NEXT:    retq
2784;
2785; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2786; SSE42:       # %bb.0:
2787; SSE42-NEXT:    movdqa (%rdi), %xmm0
2788; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
2789; SSE42-NEXT:    paddb 48(%rsi), %xmm1
2790; SSE42-NEXT:    paddb (%rsi), %xmm0
2791; SSE42-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2792; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2793; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2794; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
2795; SSE42-NEXT:    paddb (%rdx), %xmm0
2796; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
2797; SSE42-NEXT:    paddb %xmm1, %xmm2
2798; SSE42-NEXT:    paddb 32(%rdx), %xmm1
2799; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
2800; SSE42-NEXT:    movdqa %xmm0, (%rcx)
2801; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
2802; SSE42-NEXT:    retq
2803;
2804; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2805; AVX:       # %bb.0:
2806; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2807; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
2808; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2809; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2810; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2811; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2812; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2813; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2814; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2815; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
2816; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2817; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2818; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2819; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
2820; AVX-NEXT:    retq
2821;
2822; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2823; AVX2:       # %bb.0:
2824; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2825; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
2826; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2827; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2828; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2829; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2830; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2831; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2832; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2833; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2834; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2835; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2836; AVX2-NEXT:    vzeroupper
2837; AVX2-NEXT:    retq
2838;
2839; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2840; AVX512F:       # %bb.0:
2841; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2842; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
2843; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2844; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2845; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2846; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2847; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
2848; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2849; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2850; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2851; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2852; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2853; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2854; AVX512F-NEXT:    vzeroupper
2855; AVX512F-NEXT:    retq
2856;
2857; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2858; AVX512DQ:       # %bb.0:
2859; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
2860; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
2861; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2862; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2863; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2864; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2865; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
2866; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2867; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2868; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2869; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2870; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
2871; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
2872; AVX512DQ-NEXT:    vzeroupper
2873; AVX512DQ-NEXT:    retq
2874;
2875; AVX512BW-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2876; AVX512BW:       # %bb.0:
2877; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
2878; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
2879; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
2880; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2881; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2882; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
2883; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
2884; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2885; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
2886; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
2887; AVX512BW-NEXT:    vzeroupper
2888; AVX512BW-NEXT:    retq
2889  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2890  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2891  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2892  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 0, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 0, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 0, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 0, i32 91, i32 92, i32 93, i32 94, i32 95>
2893  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2894  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2895  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2896  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2897  ret void
2898}
2899
2900define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2901; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2902; SSE2:       # %bb.0:
2903; SSE2-NEXT:    movdqa (%rdi), %xmm0
2904; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
2905; SSE2-NEXT:    paddb (%rsi), %xmm0
2906; SSE2-NEXT:    paddb 48(%rsi), %xmm1
2907; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2908; SSE2-NEXT:    pand %xmm2, %xmm1
2909; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2910; SSE2-NEXT:    pandn %xmm0, %xmm2
2911; SSE2-NEXT:    por %xmm1, %xmm2
2912; SSE2-NEXT:    paddb (%rdx), %xmm2
2913; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
2914; SSE2-NEXT:    paddb %xmm0, %xmm1
2915; SSE2-NEXT:    paddb 32(%rdx), %xmm0
2916; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
2917; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
2918; SSE2-NEXT:    movdqa %xmm2, (%rcx)
2919; SSE2-NEXT:    retq
2920;
2921; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2922; SSE42:       # %bb.0:
2923; SSE42-NEXT:    movdqa (%rdi), %xmm0
2924; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
2925; SSE42-NEXT:    paddb 48(%rsi), %xmm1
2926; SSE42-NEXT:    paddb (%rsi), %xmm0
2927; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
2928; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2929; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
2930; SSE42-NEXT:    paddb (%rdx), %xmm0
2931; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
2932; SSE42-NEXT:    paddb %xmm2, %xmm1
2933; SSE42-NEXT:    paddb 32(%rdx), %xmm2
2934; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
2935; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
2936; SSE42-NEXT:    movdqa %xmm0, (%rcx)
2937; SSE42-NEXT:    retq
2938;
2939; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2940; AVX:       # %bb.0:
2941; AVX-NEXT:    vmovdqa (%rdi), %xmm0
2942; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
2943; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2944; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2945; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2946; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
2947; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2948; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
2949; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
2950; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
2951; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
2952; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
2953; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
2954; AVX-NEXT:    retq
2955;
2956; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2957; AVX2:       # %bb.0:
2958; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2959; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
2960; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2961; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2962; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm2
2963; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2964; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2965; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2966; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
2967; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2968; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
2969; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
2970; AVX2-NEXT:    vzeroupper
2971; AVX2-NEXT:    retq
2972;
2973; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2974; AVX512F:       # %bb.0:
2975; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
2976; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
2977; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2978; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2979; AVX512F-NEXT:    vpbroadcastq %xmm0, %ymm2
2980; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2981; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
2982; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
2983; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
2984; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
2985; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
2986; AVX512F-NEXT:    vzeroupper
2987; AVX512F-NEXT:    retq
2988;
2989; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2990; AVX512DQ:       # %bb.0:
2991; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
2992; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
2993; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
2994; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
2995; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm2
2996; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
2997; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
2998; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
2999; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3000; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3001; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3002; AVX512DQ-NEXT:    vzeroupper
3003; AVX512DQ-NEXT:    retq
3004;
3005; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3006; AVX512BW:       # %bb.0:
3007; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3008; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3009; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
3010; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm2
3011; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
3012; AVX512BW-NEXT:    kmovd %eax, %k1
3013; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
3014; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
3015; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3016; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3017; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3018; AVX512BW-NEXT:    vzeroupper
3019; AVX512BW-NEXT:    retq
3020  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3021  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3022  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3023  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3024  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3025  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3026  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3027  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3028  ret void
3029}
3030
3031define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3032; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3033; SSE2:       # %bb.0:
3034; SSE2-NEXT:    movdqa (%rdi), %xmm0
3035; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3036; SSE2-NEXT:    paddb (%rsi), %xmm0
3037; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3038; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3039; SSE2-NEXT:    pand %xmm2, %xmm1
3040; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3041; SSE2-NEXT:    pandn %xmm3, %xmm2
3042; SSE2-NEXT:    por %xmm1, %xmm2
3043; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3044; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3045; SSE2-NEXT:    paddb (%rdx), %xmm2
3046; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3047; SSE2-NEXT:    paddb 32(%rdx), %xmm1
3048; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
3049; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3050; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3051; SSE2-NEXT:    retq
3052;
3053; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3054; SSE42:       # %bb.0:
3055; SSE42-NEXT:    movdqa (%rdi), %xmm0
3056; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
3057; SSE42-NEXT:    paddb 48(%rsi), %xmm1
3058; SSE42-NEXT:    paddb (%rsi), %xmm0
3059; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3060; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
3061; SSE42-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3062; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3063; SSE42-NEXT:    paddb (%rdx), %xmm0
3064; SSE42-NEXT:    paddb 16(%rdx), %xmm3
3065; SSE42-NEXT:    paddb 32(%rdx), %xmm2
3066; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
3067; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
3068; SSE42-NEXT:    movdqa %xmm0, (%rcx)
3069; SSE42-NEXT:    retq
3070;
3071; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3072; AVX:       # %bb.0:
3073; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3074; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
3075; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3076; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3077; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3078; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3079; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3080; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3081; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3082; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
3083; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
3084; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3085; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
3086; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3087; AVX-NEXT:    retq
3088;
3089; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3090; AVX2:       # %bb.0:
3091; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3092; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3093; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3094; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3095; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3096; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3097; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
3098; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3099; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3100; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3101; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
3102; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
3103; AVX2-NEXT:    vzeroupper
3104; AVX2-NEXT:    retq
3105;
3106; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3107; AVX512F:       # %bb.0:
3108; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3109; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3110; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3111; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3112; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3113; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3114; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
3115; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3116; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3117; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3118; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3119; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3120; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3121; AVX512F-NEXT:    vzeroupper
3122; AVX512F-NEXT:    retq
3123;
3124; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3125; AVX512DQ:       # %bb.0:
3126; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
3127; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3128; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3129; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3130; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3131; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3132; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
3133; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3134; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3135; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3136; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3137; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3138; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3139; AVX512DQ-NEXT:    vzeroupper
3140; AVX512DQ-NEXT:    retq
3141;
3142; AVX512BW-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3143; AVX512BW:       # %bb.0:
3144; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3145; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3146; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
3147; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3148; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3149; AVX512BW-NEXT:    vpbroadcastb %xmm0, %ymm0
3150; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3151; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3152; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3153; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3154; AVX512BW-NEXT:    vzeroupper
3155; AVX512BW-NEXT:    retq
3156  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3157  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3158  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3159  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3160  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3161  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3162  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3163  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3164  ret void
3165}
3166
3167define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3168; SSE2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3169; SSE2:       # %bb.0:
3170; SSE2-NEXT:    movdqa (%rdi), %xmm0
3171; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
3172; SSE2-NEXT:    movdqa 48(%rdi), %xmm2
3173; SSE2-NEXT:    paddb 16(%rsi), %xmm1
3174; SSE2-NEXT:    paddb 48(%rsi), %xmm2
3175; SSE2-NEXT:    paddb (%rsi), %xmm0
3176; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3177; SSE2-NEXT:    pand %xmm3, %xmm2
3178; SSE2-NEXT:    pandn %xmm0, %xmm3
3179; SSE2-NEXT:    por %xmm2, %xmm3
3180; SSE2-NEXT:    paddb (%rdx), %xmm3
3181; SSE2-NEXT:    movdqa 16(%rdx), %xmm2
3182; SSE2-NEXT:    paddb %xmm0, %xmm2
3183; SSE2-NEXT:    paddb 48(%rdx), %xmm1
3184; SSE2-NEXT:    paddb 32(%rdx), %xmm0
3185; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
3186; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
3187; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
3188; SSE2-NEXT:    movdqa %xmm3, (%rcx)
3189; SSE2-NEXT:    retq
3190;
3191; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3192; SSE42:       # %bb.0:
3193; SSE42-NEXT:    movdqa (%rdi), %xmm1
3194; SSE42-NEXT:    movdqa 16(%rdi), %xmm2
3195; SSE42-NEXT:    movdqa 48(%rdi), %xmm3
3196; SSE42-NEXT:    paddb 16(%rsi), %xmm2
3197; SSE42-NEXT:    paddb 48(%rsi), %xmm3
3198; SSE42-NEXT:    paddb (%rsi), %xmm1
3199; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3200; SSE42-NEXT:    movdqa %xmm1, %xmm4
3201; SSE42-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
3202; SSE42-NEXT:    paddb (%rdx), %xmm4
3203; SSE42-NEXT:    movdqa 16(%rdx), %xmm0
3204; SSE42-NEXT:    paddb %xmm1, %xmm0
3205; SSE42-NEXT:    paddb 48(%rdx), %xmm2
3206; SSE42-NEXT:    paddb 32(%rdx), %xmm1
3207; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
3208; SSE42-NEXT:    movdqa %xmm2, 48(%rcx)
3209; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3210; SSE42-NEXT:    movdqa %xmm4, (%rcx)
3211; SSE42-NEXT:    retq
3212;
3213; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3214; AVX:       # %bb.0:
3215; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3216; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
3217; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
3218; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
3219; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
3220; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3221; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615]
3222; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm2
3223; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
3224; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
3225; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
3226; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3227; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3228; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
3229; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
3230; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
3231; AVX-NEXT:    retq
3232;
3233; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3234; AVX2:       # %bb.0:
3235; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
3236; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3237; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3238; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3239; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3240; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0]
3241; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3242; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3243; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3244; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
3245; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
3246; AVX2-NEXT:    vzeroupper
3247; AVX2-NEXT:    retq
3248;
3249; AVX512F-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3250; AVX512F:       # %bb.0:
3251; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
3252; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3253; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3254; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3255; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3256; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3257; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
3258; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3259; AVX512F-NEXT:    vpaddb (%rdx), %ymm3, %ymm1
3260; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3261; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3262; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3263; AVX512F-NEXT:    vzeroupper
3264; AVX512F-NEXT:    retq
3265;
3266; AVX512DQ-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3267; AVX512DQ:       # %bb.0:
3268; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
3269; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
3270; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3271; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3272; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3273; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3274; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
3275; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3276; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm3, %ymm1
3277; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3278; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3279; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3280; AVX512DQ-NEXT:    vzeroupper
3281; AVX512DQ-NEXT:    retq
3282;
3283; AVX512BW-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3284; AVX512BW:       # %bb.0:
3285; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3286; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3287; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
3288; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3289; AVX512BW-NEXT:    movl $65537, %eax # imm = 0x10001
3290; AVX512BW-NEXT:    kmovd %eax, %k1
3291; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
3292; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3293; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3294; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3295; AVX512BW-NEXT:    vzeroupper
3296; AVX512BW-NEXT:    retq
3297  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3298  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3299  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3300  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3301  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3302  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3303  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3304  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3305  ret void
3306}
3307
3308define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3309; SSE2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3310; SSE2:       # %bb.0:
3311; SSE2-NEXT:    movdqa (%rdi), %xmm0
3312; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3313; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3314; SSE2-NEXT:    paddb (%rsi), %xmm0
3315; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3316; SSE2-NEXT:    pand %xmm2, %xmm1
3317; SSE2-NEXT:    pandn %xmm0, %xmm2
3318; SSE2-NEXT:    por %xmm1, %xmm2
3319; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3320; SSE2-NEXT:    paddb (%rdx), %xmm2
3321; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3322; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3323; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3324; SSE2-NEXT:    retq
3325;
3326; SSE42-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3327; SSE42:       # %bb.0:
3328; SSE42-NEXT:    movdqa (%rdi), %xmm1
3329; SSE42-NEXT:    movdqa 48(%rdi), %xmm2
3330; SSE42-NEXT:    paddb 48(%rsi), %xmm2
3331; SSE42-NEXT:    paddb (%rsi), %xmm1
3332; SSE42-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3333; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
3334; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
3335; SSE42-NEXT:    paddb (%rdx), %xmm1
3336; SSE42-NEXT:    paddb 16(%rdx), %xmm3
3337; SSE42-NEXT:    movdqa %xmm1, (%rcx)
3338; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
3339; SSE42-NEXT:    retq
3340;
3341; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3342; AVX:       # %bb.0:
3343; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3344; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
3345; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3346; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3347; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
3348; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
3349; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3350; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3351; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3352; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3353; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3354; AVX-NEXT:    retq
3355;
3356; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3357; AVX2:       # %bb.0:
3358; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3359; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3360; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3361; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3362; AVX2-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
3363; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
3364; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
3365; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3366; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3367; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
3368; AVX2-NEXT:    vzeroupper
3369; AVX2-NEXT:    retq
3370;
3371; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3372; AVX512F:       # %bb.0:
3373; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3374; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3375; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3376; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3377; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0))
3378; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
3379; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3380; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3381; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
3382; AVX512F-NEXT:    vzeroupper
3383; AVX512F-NEXT:    retq
3384;
3385; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3386; AVX512DQ:       # %bb.0:
3387; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
3388; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3389; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3390; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3391; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm0 ^ (mem & (xmm1 ^ xmm0))
3392; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %xmm0
3393; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3394; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3395; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
3396; AVX512DQ-NEXT:    vzeroupper
3397; AVX512DQ-NEXT:    retq
3398;
3399; AVX512BW-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3400; AVX512BW:       # %bb.0:
3401; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3402; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3403; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
3404; AVX512BW-NEXT:    movw $1, %ax
3405; AVX512BW-NEXT:    kmovd %eax, %k1
3406; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
3407; AVX512BW-NEXT:    vpbroadcastb %xmm0, %xmm0
3408; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3409; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3410; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3411; AVX512BW-NEXT:    vzeroupper
3412; AVX512BW-NEXT:    retq
3413  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3414  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3415  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3416  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3417  %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3418  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3419  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3420  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3421  ret void
3422}
3423
3424define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3425; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3426; SSE2:       # %bb.0:
3427; SSE2-NEXT:    movdqa (%rdi), %xmm0
3428; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3429; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3430; SSE2-NEXT:    paddb (%rsi), %xmm0
3431; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
3432; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
3433; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
3434; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3435; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
3436; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3437; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3438; SSE2-NEXT:    paddb (%rdx), %xmm2
3439; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
3440; SSE2-NEXT:    paddb %xmm0, %xmm1
3441; SSE2-NEXT:    paddb 32(%rdx), %xmm0
3442; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
3443; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
3444; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3445; SSE2-NEXT:    retq
3446;
3447; SSE42-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3448; SSE42:       # %bb.0:
3449; SSE42-NEXT:    movdqa (%rdi), %xmm0
3450; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
3451; SSE42-NEXT:    paddb (%rsi), %xmm0
3452; SSE42-NEXT:    paddb 48(%rsi), %xmm1
3453; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3454; SSE42-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
3455; SSE42-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3456; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3457; SSE42-NEXT:    paddb (%rdx), %xmm2
3458; SSE42-NEXT:    movdqa 16(%rdx), %xmm1
3459; SSE42-NEXT:    paddb %xmm0, %xmm1
3460; SSE42-NEXT:    paddb 32(%rdx), %xmm0
3461; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
3462; SSE42-NEXT:    movdqa %xmm1, 16(%rcx)
3463; SSE42-NEXT:    movdqa %xmm2, (%rcx)
3464; SSE42-NEXT:    retq
3465;
3466; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3467; AVX:       # %bb.0:
3468; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3469; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
3470; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3471; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3472; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
3473; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3474; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3475; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3476; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
3477; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm1
3478; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
3479; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
3480; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
3481; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
3482; AVX-NEXT:    retq
3483;
3484; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3485; AVX2:       # %bb.0:
3486; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3487; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3488; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3489; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3490; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
3491; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3492; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3493; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3494; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
3495; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
3496; AVX2-NEXT:    vzeroupper
3497; AVX2-NEXT:    retq
3498;
3499; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3500; AVX512F:       # %bb.0:
3501; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3502; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3503; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3504; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3505; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
3506; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3507; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3508; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3509; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3510; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3511; AVX512F-NEXT:    vzeroupper
3512; AVX512F-NEXT:    retq
3513;
3514; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3515; AVX512DQ:       # %bb.0:
3516; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
3517; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3518; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3519; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3520; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %ymm0
3521; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3522; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3523; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3524; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3525; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3526; AVX512DQ-NEXT:    vzeroupper
3527; AVX512DQ-NEXT:    retq
3528;
3529; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3530; AVX512BW:       # %bb.0:
3531; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3532; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3533; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31]
3534; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
3535; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
3536; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3537; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3538; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3539; AVX512BW-NEXT:    vzeroupper
3540; AVX512BW-NEXT:    retq
3541  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3542  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3543  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3544  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3545  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31, i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47>
3546  %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3547  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3548  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3549  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3550  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3551  ret void
3552}
3553
3554define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3555; SSE2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3556; SSE2:       # %bb.0:
3557; SSE2-NEXT:    movdqa (%rdi), %xmm0
3558; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3559; SSE2-NEXT:    paddb (%rsi), %xmm0
3560; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3561; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
3562; SSE2-NEXT:    pand %xmm2, %xmm1
3563; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3564; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3565; SSE2-NEXT:    pandn %xmm0, %xmm2
3566; SSE2-NEXT:    por %xmm1, %xmm2
3567; SSE2-NEXT:    paddb (%rdx), %xmm2
3568; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
3569; SSE2-NEXT:    paddb %xmm0, %xmm1
3570; SSE2-NEXT:    paddb 32(%rdx), %xmm0
3571; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
3572; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
3573; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3574; SSE2-NEXT:    retq
3575;
3576; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3577; SSE42:       # %bb.0:
3578; SSE42-NEXT:    movdqa (%rdi), %xmm0
3579; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
3580; SSE42-NEXT:    paddb 48(%rsi), %xmm1
3581; SSE42-NEXT:    paddb (%rsi), %xmm0
3582; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3583; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3584; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3585; SSE42-NEXT:    paddb (%rdx), %xmm1
3586; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
3587; SSE42-NEXT:    paddb %xmm0, %xmm2
3588; SSE42-NEXT:    paddb 32(%rdx), %xmm0
3589; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
3590; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
3591; SSE42-NEXT:    movdqa %xmm1, (%rcx)
3592; SSE42-NEXT:    retq
3593;
3594; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3595; AVX:       # %bb.0:
3596; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3597; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
3598; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3599; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3600; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3601; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3602; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3603; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3604; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
3605; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3606; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3607; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
3608; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3609; AVX-NEXT:    retq
3610;
3611; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3612; AVX2:       # %bb.0:
3613; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3614; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3615; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3616; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
3617; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3618; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3619; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3620; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3621; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3622; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
3623; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
3624; AVX2-NEXT:    vzeroupper
3625; AVX2-NEXT:    retq
3626;
3627; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3628; AVX512F:       # %bb.0:
3629; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3630; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3631; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3632; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
3633; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3634; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3635; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3636; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3637; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3638; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3639; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3640; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3641; AVX512F-NEXT:    vzeroupper
3642; AVX512F-NEXT:    retq
3643;
3644; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3645; AVX512DQ:       # %bb.0:
3646; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
3647; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3648; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3649; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %ymm0
3650; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3651; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3652; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3653; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3654; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3655; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3656; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3657; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3658; AVX512DQ-NEXT:    vzeroupper
3659; AVX512DQ-NEXT:    retq
3660;
3661; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3662; AVX512BW:       # %bb.0:
3663; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3664; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3665; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31]
3666; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
3667; AVX512BW-NEXT:    vpbroadcastw %xmm0, %xmm0
3668; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3669; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3670; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3671; AVX512BW-NEXT:    vzeroupper
3672; AVX512BW-NEXT:    retq
3673  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3674  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3675  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3676  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3677  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 0, i32 28, i32 29, i32 0, i32 31, i32 32, i32 0, i32 34, i32 35, i32 0, i32 37, i32 38, i32 0, i32 40, i32 41, i32 0, i32 43, i32 44, i32 0, i32 46, i32 47>
3678  %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3679  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3680  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3681  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3682  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3683  ret void
3684}
3685
3686define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3687; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3688; SSE2:       # %bb.0:
3689; SSE2-NEXT:    movdqa (%rdi), %xmm0
3690; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3691; SSE2-NEXT:    paddb (%rsi), %xmm0
3692; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3693; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
3694; SSE2-NEXT:    pand %xmm2, %xmm1
3695; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3696; SSE2-NEXT:    pandn %xmm0, %xmm2
3697; SSE2-NEXT:    por %xmm1, %xmm2
3698; SSE2-NEXT:    paddb (%rdx), %xmm2
3699; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
3700; SSE2-NEXT:    paddb %xmm0, %xmm1
3701; SSE2-NEXT:    paddb 32(%rdx), %xmm0
3702; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
3703; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
3704; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3705; SSE2-NEXT:    retq
3706;
3707; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3708; SSE42:       # %bb.0:
3709; SSE42-NEXT:    movdqa (%rdi), %xmm0
3710; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
3711; SSE42-NEXT:    paddb 48(%rsi), %xmm1
3712; SSE42-NEXT:    paddb (%rsi), %xmm0
3713; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3714; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
3715; SSE42-NEXT:    paddb (%rdx), %xmm1
3716; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
3717; SSE42-NEXT:    paddb %xmm0, %xmm2
3718; SSE42-NEXT:    paddb 32(%rdx), %xmm0
3719; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
3720; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
3721; SSE42-NEXT:    movdqa %xmm1, (%rcx)
3722; SSE42-NEXT:    retq
3723;
3724; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3725; AVX:       # %bb.0:
3726; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3727; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
3728; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3729; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3730; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3731; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
3732; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3733; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
3734; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3735; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3736; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
3737; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3738; AVX-NEXT:    retq
3739;
3740; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3741; AVX2:       # %bb.0:
3742; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3743; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3744; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3745; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm2
3746; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3747; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
3748; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3749; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
3750; AVX2-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
3751; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
3752; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
3753; AVX2-NEXT:    vzeroupper
3754; AVX2-NEXT:    retq
3755;
3756; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3757; AVX512F:       # %bb.0:
3758; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3759; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3760; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3761; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3762; AVX512F-NEXT:    vpbroadcastq %xmm0, %ymm2
3763; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3764; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
3765; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3766; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3767; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3768; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3769; AVX512F-NEXT:    vzeroupper
3770; AVX512F-NEXT:    retq
3771;
3772; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3773; AVX512DQ:       # %bb.0:
3774; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
3775; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3776; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3777; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3778; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm2
3779; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3780; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %ymm0
3781; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3782; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3783; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3784; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3785; AVX512DQ-NEXT:    vzeroupper
3786; AVX512DQ-NEXT:    retq
3787;
3788; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3789; AVX512BW:       # %bb.0:
3790; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3791; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3792; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
3793; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15]
3794; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
3795; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3796; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
3797; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3798; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3799; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3800; AVX512BW-NEXT:    vzeroupper
3801; AVX512BW-NEXT:    retq
3802  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3803  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3804  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3805  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3806  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47>
3807  %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3808  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3809  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3810  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3811  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3812  ret void
3813}
3814
3815define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3816; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3817; SSE2:       # %bb.0:
3818; SSE2-NEXT:    movdqa (%rdi), %xmm0
3819; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3820; SSE2-NEXT:    paddb (%rsi), %xmm0
3821; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3822; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
3823; SSE2-NEXT:    pand %xmm2, %xmm1
3824; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3825; SSE2-NEXT:    pandn %xmm3, %xmm2
3826; SSE2-NEXT:    por %xmm1, %xmm2
3827; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3828; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3829; SSE2-NEXT:    paddb (%rdx), %xmm2
3830; SSE2-NEXT:    paddb 16(%rdx), %xmm0
3831; SSE2-NEXT:    paddb 32(%rdx), %xmm1
3832; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
3833; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
3834; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3835; SSE2-NEXT:    retq
3836;
3837; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3838; SSE42:       # %bb.0:
3839; SSE42-NEXT:    movdqa (%rdi), %xmm0
3840; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
3841; SSE42-NEXT:    paddb 48(%rsi), %xmm1
3842; SSE42-NEXT:    paddb (%rsi), %xmm0
3843; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
3844; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
3845; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3846; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3847; SSE42-NEXT:    paddb (%rdx), %xmm2
3848; SSE42-NEXT:    paddb 16(%rdx), %xmm0
3849; SSE42-NEXT:    paddb 32(%rdx), %xmm1
3850; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
3851; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
3852; SSE42-NEXT:    movdqa %xmm2, (%rcx)
3853; SSE42-NEXT:    retq
3854;
3855; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3856; AVX:       # %bb.0:
3857; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3858; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
3859; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3860; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3861; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3862; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3863; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7]
3864; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3865; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
3866; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
3867; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3868; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3869; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
3870; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
3871; AVX-NEXT:    retq
3872;
3873; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3874; AVX2:       # %bb.0:
3875; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
3876; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
3877; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3878; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
3879; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3880; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
3881; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3882; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3883; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3884; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
3885; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
3886; AVX2-NEXT:    vzeroupper
3887; AVX2-NEXT:    retq
3888;
3889; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3890; AVX512F:       # %bb.0:
3891; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
3892; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
3893; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3894; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
3895; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3896; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3897; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
3898; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3899; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3900; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3901; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
3902; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
3903; AVX512F-NEXT:    vzeroupper
3904; AVX512F-NEXT:    retq
3905;
3906; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3907; AVX512DQ:       # %bb.0:
3908; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
3909; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
3910; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3911; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %ymm0
3912; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3913; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
3914; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
3915; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
3916; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
3917; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
3918; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
3919; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
3920; AVX512DQ-NEXT:    vzeroupper
3921; AVX512DQ-NEXT:    retq
3922;
3923; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3924; AVX512BW:       # %bb.0:
3925; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
3926; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
3927; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
3928; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0]
3929; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
3930; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
3931; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3932; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
3933; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
3934; AVX512BW-NEXT:    vzeroupper
3935; AVX512BW-NEXT:    retq
3936  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3937  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3938  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3939  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3940  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 0, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 0, i32 43, i32 44, i32 45, i32 46, i32 47>
3941  %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3942  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3943  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3944  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3945  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3946  ret void
3947}
3948
3949define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3950; SSE2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3951; SSE2:       # %bb.0:
3952; SSE2-NEXT:    movdqa (%rdi), %xmm0
3953; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
3954; SSE2-NEXT:    paddb 48(%rsi), %xmm1
3955; SSE2-NEXT:    paddb (%rsi), %xmm0
3956; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
3957; SSE2-NEXT:    pand %xmm2, %xmm1
3958; SSE2-NEXT:    pandn %xmm0, %xmm2
3959; SSE2-NEXT:    por %xmm1, %xmm2
3960; SSE2-NEXT:    paddb (%rdx), %xmm2
3961; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
3962; SSE2-NEXT:    paddb %xmm0, %xmm1
3963; SSE2-NEXT:    paddb 32(%rdx), %xmm0
3964; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
3965; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
3966; SSE2-NEXT:    movdqa %xmm2, (%rcx)
3967; SSE2-NEXT:    retq
3968;
3969; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3970; SSE42:       # %bb.0:
3971; SSE42-NEXT:    movdqa (%rdi), %xmm0
3972; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
3973; SSE42-NEXT:    paddb (%rsi), %xmm0
3974; SSE42-NEXT:    paddb 48(%rsi), %xmm1
3975; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
3976; SSE42-NEXT:    paddb (%rdx), %xmm1
3977; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
3978; SSE42-NEXT:    paddb %xmm0, %xmm2
3979; SSE42-NEXT:    paddb 32(%rdx), %xmm0
3980; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
3981; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
3982; SSE42-NEXT:    movdqa %xmm1, (%rcx)
3983; SSE42-NEXT:    retq
3984;
3985; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3986; AVX:       # %bb.0:
3987; AVX-NEXT:    vmovdqa (%rdi), %xmm0
3988; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
3989; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
3990; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
3991; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
3992; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
3993; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
3994; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
3995; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
3996; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
3997; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
3998; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
3999; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
4000; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
4001; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
4002; AVX-NEXT:    retq
4003;
4004; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4005; AVX2:       # %bb.0:
4006; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4007; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4008; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
4009; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4010; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4011; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4012; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4013; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4014; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
4015; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
4016; AVX2-NEXT:    vzeroupper
4017; AVX2-NEXT:    retq
4018;
4019; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4020; AVX512F:       # %bb.0:
4021; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4022; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4023; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
4024; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4025; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4026; AVX512F-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4027; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4028; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4029; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
4030; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
4031; AVX512F-NEXT:    vzeroupper
4032; AVX512F-NEXT:    retq
4033;
4034; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4035; AVX512DQ:       # %bb.0:
4036; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4037; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4038; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
4039; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4040; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4041; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4042; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4043; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4044; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
4045; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)
4046; AVX512DQ-NEXT:    vzeroupper
4047; AVX512DQ-NEXT:    retq
4048;
4049; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4050; AVX512BW:       # %bb.0:
4051; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
4052; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4053; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4054; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15]
4055; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
4056; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
4057; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
4058; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4059; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4060; AVX512BW-NEXT:    vzeroupper
4061; AVX512BW-NEXT:    retq
4062  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4063  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4064  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4065  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4066  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4067  %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
4068  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4069  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4070  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4071  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4072  ret void
4073}
4074
4075define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4076; SSE2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4077; SSE2:       # %bb.0:
4078; SSE2-NEXT:    movdqa (%rdi), %xmm0
4079; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4080; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4081; SSE2-NEXT:    paddb (%rsi), %xmm0
4082; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
4083; SSE2-NEXT:    pand %xmm2, %xmm1
4084; SSE2-NEXT:    pandn %xmm0, %xmm2
4085; SSE2-NEXT:    por %xmm1, %xmm2
4086; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4087; SSE2-NEXT:    paddb (%rdx), %xmm2
4088; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4089; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4090; SSE2-NEXT:    movdqa %xmm2, (%rcx)
4091; SSE2-NEXT:    retq
4092;
4093; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4094; SSE42:       # %bb.0:
4095; SSE42-NEXT:    movdqa (%rdi), %xmm0
4096; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4097; SSE42-NEXT:    paddb (%rsi), %xmm0
4098; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4099; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4100; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4101; SSE42-NEXT:    paddb (%rdx), %xmm1
4102; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4103; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4104; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4105; SSE42-NEXT:    retq
4106;
4107; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4108; AVX:       # %bb.0:
4109; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4110; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
4111; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4112; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4113; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4114; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4115; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4116; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
4117; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4118; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
4119; AVX-NEXT:    retq
4120;
4121; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4122; AVX2:       # %bb.0:
4123; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
4124; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
4125; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4126; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4127; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4128; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
4129; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
4130; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4131; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4132; AVX2-NEXT:    vzeroupper
4133; AVX2-NEXT:    retq
4134;
4135; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4136; AVX512F:       # %bb.0:
4137; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
4138; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
4139; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4140; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4141; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4142; AVX512F-NEXT:    vpbroadcastw %xmm0, %xmm0
4143; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
4144; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4145; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4146; AVX512F-NEXT:    vzeroupper
4147; AVX512F-NEXT:    retq
4148;
4149; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4150; AVX512DQ:       # %bb.0:
4151; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
4152; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
4153; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4154; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4155; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4156; AVX512DQ-NEXT:    vpbroadcastw %xmm0, %xmm0
4157; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
4158; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4159; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4160; AVX512DQ-NEXT:    vzeroupper
4161; AVX512DQ-NEXT:    retq
4162;
4163; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4164; AVX512BW:       # %bb.0:
4165; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
4166; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4167; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4168; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0]
4169; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
4170; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
4171; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4172; AVX512BW-NEXT:    vzeroupper
4173; AVX512BW-NEXT:    retq
4174  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4175  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4176  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4177  %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4178  %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4179  %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
4180  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4181  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4182  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4183  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4184  ret void
4185}
4186
4187define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4188; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4189; SSE2:       # %bb.0:
4190; SSE2-NEXT:    movdqa (%rdi), %xmm0
4191; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4192; SSE2-NEXT:    paddb (%rsi), %xmm0
4193; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4194; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
4195; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4196; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4197; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4198; SSE2-NEXT:    paddb (%rdx), %xmm2
4199; SSE2-NEXT:    movdqa 16(%rdx), %xmm1
4200; SSE2-NEXT:    paddb %xmm0, %xmm1
4201; SSE2-NEXT:    paddb 32(%rdx), %xmm0
4202; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
4203; SSE2-NEXT:    movdqa %xmm1, 16(%rcx)
4204; SSE2-NEXT:    movdqa %xmm2, (%rcx)
4205; SSE2-NEXT:    retq
4206;
4207; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4208; SSE42:       # %bb.0:
4209; SSE42-NEXT:    movdqa (%rdi), %xmm0
4210; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4211; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4212; SSE42-NEXT:    paddb (%rsi), %xmm0
4213; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4214; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
4215; SSE42-NEXT:    paddb (%rdx), %xmm1
4216; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
4217; SSE42-NEXT:    paddb %xmm0, %xmm2
4218; SSE42-NEXT:    paddb 32(%rdx), %xmm0
4219; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
4220; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
4221; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4222; SSE42-NEXT:    retq
4223;
4224; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4225; AVX:       # %bb.0:
4226; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4227; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
4228; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4229; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4230; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4231; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
4232; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
4233; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
4234; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
4235; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
4236; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
4237; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
4238; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
4239; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
4240; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
4241; AVX-NEXT:    vzeroupper
4242; AVX-NEXT:    retq
4243;
4244; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4245; AVX2:       # %bb.0:
4246; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
4247; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
4248; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4249; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm2
4250; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4251; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
4252; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
4253; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4254; AVX2-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
4255; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4256; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
4257; AVX2-NEXT:    vzeroupper
4258; AVX2-NEXT:    retq
4259;
4260; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4261; AVX512F:       # %bb.0:
4262; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4263; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
4264; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4265; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4266; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4267; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4268; AVX512F-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4269; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4270; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4271; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4272; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4273; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
4274; AVX512F-NEXT:    vzeroupper
4275; AVX512F-NEXT:    retq
4276;
4277; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4278; AVX512DQ:       # %bb.0:
4279; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4280; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
4281; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4282; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4283; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4284; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4285; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4286; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4287; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4288; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4289; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4290; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
4291; AVX512DQ-NEXT:    vzeroupper
4292; AVX512DQ-NEXT:    retq
4293;
4294; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4295; AVX512BW:       # %bb.0:
4296; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
4297; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4298; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4299; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4300; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4301; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4302; AVX512BW-NEXT:    vzeroupper
4303; AVX512BW-NEXT:    retq
4304  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4305  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4306  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4307  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4308  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 0, i32 15, i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23>
4309  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4310  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4311  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4312  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4313  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4314  ret void
4315}
4316
4317define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4318; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4319; SSE2:       # %bb.0:
4320; SSE2-NEXT:    movdqa (%rdi), %xmm0
4321; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4322; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4323; SSE2-NEXT:    paddb (%rsi), %xmm0
4324; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4325; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
4326; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
4327; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
4328; SSE2-NEXT:    paddb (%rdx), %xmm0
4329; SSE2-NEXT:    paddb 16(%rdx), %xmm3
4330; SSE2-NEXT:    paddb 32(%rdx), %xmm2
4331; SSE2-NEXT:    movdqa %xmm2, 32(%rcx)
4332; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
4333; SSE2-NEXT:    movdqa %xmm0, (%rcx)
4334; SSE2-NEXT:    retq
4335;
4336; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4337; SSE42:       # %bb.0:
4338; SSE42-NEXT:    movdqa (%rdi), %xmm0
4339; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4340; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4341; SSE42-NEXT:    paddb (%rsi), %xmm0
4342; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4343; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
4344; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4345; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4346; SSE42-NEXT:    paddb (%rdx), %xmm2
4347; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4348; SSE42-NEXT:    paddb 32(%rdx), %xmm1
4349; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
4350; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4351; SSE42-NEXT:    movdqa %xmm2, (%rcx)
4352; SSE42-NEXT:    retq
4353;
4354; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4355; AVX:       # %bb.0:
4356; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4357; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
4358; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4359; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4360; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4361; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4362; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4363; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4364; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4365; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
4366; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
4367; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
4368; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
4369; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4370; AVX-NEXT:    retq
4371;
4372; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4373; AVX2:       # %bb.0:
4374; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4375; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
4376; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4377; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4378; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4379; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4380; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
4381; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
4382; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4383; AVX2-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
4384; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4385; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
4386; AVX2-NEXT:    vzeroupper
4387; AVX2-NEXT:    retq
4388;
4389; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4390; AVX512F:       # %bb.0:
4391; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4392; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
4393; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4394; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4395; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4396; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4397; AVX512F-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4398; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4399; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4400; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4401; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4402; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
4403; AVX512F-NEXT:    vzeroupper
4404; AVX512F-NEXT:    retq
4405;
4406; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4407; AVX512DQ:       # %bb.0:
4408; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4409; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
4410; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4411; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4412; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4413; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4414; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4415; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4416; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4417; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4418; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4419; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
4420; AVX512DQ-NEXT:    vzeroupper
4421; AVX512DQ-NEXT:    retq
4422;
4423; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4424; AVX512BW-SLOW:       # %bb.0:
4425; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
4426; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4427; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4428; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
4429; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4430; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4431; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4432; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4433; AVX512BW-SLOW-NEXT:    vzeroupper
4434; AVX512BW-SLOW-NEXT:    retq
4435;
4436; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4437; AVX512BW-FAST:       # %bb.0:
4438; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
4439; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4440; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4441; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4442; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4443; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4444; AVX512BW-FAST-NEXT:    vzeroupper
4445; AVX512BW-FAST-NEXT:    retq
4446  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4447  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4448  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4449  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4450  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 0, i32 16, i32 17, i32 0, i32 19, i32 20, i32 0, i32 22, i32 23>
4451  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4452  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4453  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4454  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4455  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4456  ret void
4457}
4458
4459define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4460; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4461; SSE2:       # %bb.0:
4462; SSE2-NEXT:    movdqa (%rdi), %xmm0
4463; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4464; SSE2-NEXT:    paddb (%rsi), %xmm0
4465; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4466; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
4467; SSE2-NEXT:    paddb (%rdx), %xmm1
4468; SSE2-NEXT:    movdqa 16(%rdx), %xmm2
4469; SSE2-NEXT:    paddb %xmm0, %xmm2
4470; SSE2-NEXT:    paddb 32(%rdx), %xmm0
4471; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
4472; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
4473; SSE2-NEXT:    movdqa %xmm1, (%rcx)
4474; SSE2-NEXT:    retq
4475;
4476; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4477; SSE42:       # %bb.0:
4478; SSE42-NEXT:    movdqa (%rdi), %xmm0
4479; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4480; SSE42-NEXT:    paddb (%rsi), %xmm0
4481; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4482; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4483; SSE42-NEXT:    paddb (%rdx), %xmm1
4484; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
4485; SSE42-NEXT:    paddb %xmm0, %xmm2
4486; SSE42-NEXT:    paddb 32(%rdx), %xmm0
4487; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
4488; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
4489; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4490; SSE42-NEXT:    retq
4491;
4492; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4493; AVX:       # %bb.0:
4494; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4495; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
4496; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
4497; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
4498; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4499; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
4500; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
4501; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
4502; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
4503; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
4504; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
4505; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
4506; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
4507; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
4508; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
4509; AVX-NEXT:    retq
4510;
4511; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4512; AVX2:       # %bb.0:
4513; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4514; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4515; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
4516; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4517; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4518; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
4519; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4520; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4521; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
4522; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
4523; AVX2-NEXT:    vzeroupper
4524; AVX2-NEXT:    retq
4525;
4526; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4527; AVX512F:       # %bb.0:
4528; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4529; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
4530; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4531; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4532; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4533; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4534; AVX512F-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4535; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4536; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4537; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4538; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4539; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
4540; AVX512F-NEXT:    vzeroupper
4541; AVX512F-NEXT:    retq
4542;
4543; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4544; AVX512DQ:       # %bb.0:
4545; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4546; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
4547; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4548; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4549; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4550; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4551; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4552; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4553; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4554; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4555; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4556; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
4557; AVX512DQ-NEXT:    vzeroupper
4558; AVX512DQ-NEXT:    retq
4559;
4560; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4561; AVX512BW-SLOW:       # %bb.0:
4562; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
4563; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4564; AVX512BW-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
4565; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4566; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
4567; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4568; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4569; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4570; AVX512BW-SLOW-NEXT:    vzeroupper
4571; AVX512BW-SLOW-NEXT:    retq
4572;
4573; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4574; AVX512BW-FAST:       # %bb.0:
4575; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
4576; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4577; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4578; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
4579; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4580; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4581; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4582; AVX512BW-FAST-NEXT:    vzeroupper
4583; AVX512BW-FAST-NEXT:    retq
4584  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4585  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4586  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4587  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4588  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23>
4589  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4590  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4591  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4592  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4593  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4594  ret void
4595}
4596
4597define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4598; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4599; SSE2:       # %bb.0:
4600; SSE2-NEXT:    movdqa (%rdi), %xmm0
4601; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4602; SSE2-NEXT:    paddb (%rsi), %xmm0
4603; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4604; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
4605; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4606; SSE2-NEXT:    paddb (%rdx), %xmm1
4607; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4608; SSE2-NEXT:    movdqa %xmm1, (%rcx)
4609; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4610; SSE2-NEXT:    retq
4611;
4612; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4613; SSE42:       # %bb.0:
4614; SSE42-NEXT:    movdqa (%rdi), %xmm0
4615; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4616; SSE42-NEXT:    paddb (%rsi), %xmm0
4617; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4618; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4619; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4620; SSE42-NEXT:    paddb (%rdx), %xmm1
4621; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4622; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4623; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4624; SSE42-NEXT:    retq
4625;
4626; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4627; AVX:       # %bb.0:
4628; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4629; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
4630; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4631; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4632; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4633; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4634; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
4635; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
4636; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
4637; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
4638; AVX-NEXT:    retq
4639;
4640; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4641; AVX2:       # %bb.0:
4642; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4643; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
4644; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4645; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4646; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4647; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7]
4648; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
4649; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4650; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4651; AVX2-NEXT:    vzeroupper
4652; AVX2-NEXT:    retq
4653;
4654; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4655; AVX512F:       # %bb.0:
4656; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4657; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
4658; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4659; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4660; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4661; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4662; AVX512F-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4663; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4664; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4665; AVX512F-NEXT:    vzeroupper
4666; AVX512F-NEXT:    retq
4667;
4668; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4669; AVX512DQ:       # %bb.0:
4670; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4671; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
4672; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4673; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4674; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4675; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4676; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4677; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4678; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4679; AVX512DQ-NEXT:    vzeroupper
4680; AVX512DQ-NEXT:    retq
4681;
4682; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4683; AVX512BW-SLOW:       # %bb.0:
4684; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
4685; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4686; AVX512BW-SLOW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4687; AVX512BW-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4688; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4689; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4690; AVX512BW-SLOW-NEXT:    vzeroupper
4691; AVX512BW-SLOW-NEXT:    retq
4692;
4693; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4694; AVX512BW-FAST:       # %bb.0:
4695; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
4696; AVX512BW-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4697; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4698; AVX512BW-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
4699; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
4700; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4701; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4702; AVX512BW-FAST-NEXT:    vzeroupper
4703; AVX512BW-FAST-NEXT:    retq
4704  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4705  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4706  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4707  %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4708  %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 0, i32 19, i32 20, i32 21, i32 22, i32 23>
4709  %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4710  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4711  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4712  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4713  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4714  ret void
4715}
4716
4717define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4718; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4719; SSE2:       # %bb.0:
4720; SSE2-NEXT:    movdqa (%rdi), %xmm0
4721; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4722; SSE2-NEXT:    paddb (%rsi), %xmm0
4723; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4724; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4725; SSE2-NEXT:    paddb (%rdx), %xmm1
4726; SSE2-NEXT:    movdqa 16(%rdx), %xmm2
4727; SSE2-NEXT:    paddb %xmm0, %xmm2
4728; SSE2-NEXT:    paddb 32(%rdx), %xmm0
4729; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
4730; SSE2-NEXT:    movdqa %xmm2, 16(%rcx)
4731; SSE2-NEXT:    movdqa %xmm1, (%rcx)
4732; SSE2-NEXT:    retq
4733;
4734; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4735; SSE42:       # %bb.0:
4736; SSE42-NEXT:    movdqa (%rdi), %xmm0
4737; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4738; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4739; SSE42-NEXT:    paddb (%rsi), %xmm0
4740; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4741; SSE42-NEXT:    paddb (%rdx), %xmm1
4742; SSE42-NEXT:    movdqa 16(%rdx), %xmm2
4743; SSE42-NEXT:    paddb %xmm0, %xmm2
4744; SSE42-NEXT:    paddb 32(%rdx), %xmm0
4745; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
4746; SSE42-NEXT:    movdqa %xmm2, 16(%rcx)
4747; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4748; SSE42-NEXT:    retq
4749;
4750; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4751; AVX:       # %bb.0:
4752; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4753; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
4754; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
4755; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
4756; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
4757; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4758; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
4759; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
4760; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
4761; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
4762; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
4763; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
4764; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
4765; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
4766; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
4767; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
4768; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
4769; AVX-NEXT:    vzeroupper
4770; AVX-NEXT:    retq
4771;
4772; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4773; AVX2:       # %bb.0:
4774; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4775; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
4776; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4777; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4778; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
4779; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
4780; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
4781; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
4782; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
4783; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
4784; AVX2-NEXT:    vzeroupper
4785; AVX2-NEXT:    retq
4786;
4787; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4788; AVX512F:       # %bb.0:
4789; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4790; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
4791; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4792; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4793; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4794; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
4795; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
4796; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4797; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4798; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4799; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4800; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
4801; AVX512F-NEXT:    vzeroupper
4802; AVX512F-NEXT:    retq
4803;
4804; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4805; AVX512DQ:       # %bb.0:
4806; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4807; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
4808; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4809; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4810; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4811; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
4812; AVX512DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
4813; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4814; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
4815; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4816; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4817; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
4818; AVX512DQ-NEXT:    vzeroupper
4819; AVX512DQ-NEXT:    retq
4820;
4821; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4822; AVX512BW-SLOW:       # %bb.0:
4823; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
4824; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4825; AVX512BW-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4826; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
4827; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
4828; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4829; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4830; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4831; AVX512BW-SLOW-NEXT:    vzeroupper
4832; AVX512BW-SLOW-NEXT:    retq
4833;
4834; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4835; AVX512BW-FAST:       # %bb.0:
4836; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
4837; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
4838; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4839; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm1
4840; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4841; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4842; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4843; AVX512BW-FAST-NEXT:    vzeroupper
4844; AVX512BW-FAST-NEXT:    retq
4845  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4846  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4847  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4848  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
4849  %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <6 x i32> <i32 0, i32 7, i32 0, i32 9, i32 0, i32 11>
4850  %out.bytevec = bitcast <6 x i64> %broadcast.of.aextinreg to <48 x i8>
4851  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4852  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4853  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4854  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4855  ret void
4856}
4857
4858define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4859; SSE2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4860; SSE2:       # %bb.0:
4861; SSE2-NEXT:    movdqa (%rdi), %xmm0
4862; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
4863; SSE2-NEXT:    paddb (%rsi), %xmm0
4864; SSE2-NEXT:    paddb 48(%rsi), %xmm1
4865; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4866; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4867; SSE2-NEXT:    paddb (%rdx), %xmm1
4868; SSE2-NEXT:    paddb 16(%rdx), %xmm0
4869; SSE2-NEXT:    movdqa %xmm1, (%rcx)
4870; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
4871; SSE2-NEXT:    retq
4872;
4873; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4874; SSE42:       # %bb.0:
4875; SSE42-NEXT:    movdqa (%rdi), %xmm0
4876; SSE42-NEXT:    movdqa 48(%rdi), %xmm1
4877; SSE42-NEXT:    paddb 48(%rsi), %xmm1
4878; SSE42-NEXT:    paddb (%rsi), %xmm0
4879; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4880; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4881; SSE42-NEXT:    paddb (%rdx), %xmm1
4882; SSE42-NEXT:    paddb 16(%rdx), %xmm0
4883; SSE42-NEXT:    movdqa %xmm1, (%rcx)
4884; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
4885; SSE42-NEXT:    retq
4886;
4887; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4888; AVX:       # %bb.0:
4889; AVX-NEXT:    vmovdqa (%rdi), %xmm0
4890; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
4891; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
4892; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
4893; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
4894; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
4895; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
4896; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
4897; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
4898; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
4899; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
4900; AVX-NEXT:    vzeroupper
4901; AVX-NEXT:    retq
4902;
4903; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4904; AVX2:       # %bb.0:
4905; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4906; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
4907; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4908; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4909; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
4910; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0]
4911; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4912; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
4913; AVX2-NEXT:    vzeroupper
4914; AVX2-NEXT:    retq
4915;
4916; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4917; AVX512F:       # %bb.0:
4918; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
4919; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
4920; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4921; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4922; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4923; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
4924; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
4925; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4926; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
4927; AVX512F-NEXT:    vzeroupper
4928; AVX512F-NEXT:    retq
4929;
4930; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4931; AVX512DQ:       # %bb.0:
4932; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
4933; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
4934; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
4935; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
4936; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4937; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
4938; AVX512DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
4939; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
4940; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
4941; AVX512DQ-NEXT:    vzeroupper
4942; AVX512DQ-NEXT:    retq
4943;
4944; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4945; AVX512BW-SLOW:       # %bb.0:
4946; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
4947; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4948; AVX512BW-SLOW-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [0,7]
4949; AVX512BW-SLOW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
4950; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4951; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
4952; AVX512BW-SLOW-NEXT:    vzeroupper
4953; AVX512BW-SLOW-NEXT:    retq
4954;
4955; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4956; AVX512BW-FAST:       # %bb.0:
4957; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
4958; AVX512BW-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
4959; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
4960; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
4961; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
4962; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
4963; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
4964; AVX512BW-FAST-NEXT:    vzeroupper
4965; AVX512BW-FAST-NEXT:    retq
4966  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4967  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4968  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4969  %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
4970  %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <6 x i32> <i32 0, i32 7, i32 8, i32 0, i32 10, i32 11>
4971  %out.bytevec = bitcast <6 x i64> %broadcast.of.aextinreg to <48 x i8>
4972  %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4973  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4974  %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4975  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4976  ret void
4977}
4978
4979define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4980; SSE-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
4981; SSE:       # %bb.0:
4982; SSE-NEXT:    movdqa (%rdi), %xmm0
4983; SSE-NEXT:    paddb (%rsi), %xmm0
4984; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
4985; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4986; SSE-NEXT:    movdqa 16(%rdx), %xmm1
4987; SSE-NEXT:    paddb %xmm0, %xmm1
4988; SSE-NEXT:    movdqa (%rdx), %xmm2
4989; SSE-NEXT:    paddb %xmm0, %xmm2
4990; SSE-NEXT:    movdqa 48(%rdx), %xmm3
4991; SSE-NEXT:    paddb %xmm0, %xmm3
4992; SSE-NEXT:    paddb 32(%rdx), %xmm0
4993; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
4994; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
4995; SSE-NEXT:    movdqa %xmm2, (%rcx)
4996; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
4997; SSE-NEXT:    retq
4998;
4999; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5000; AVX:       # %bb.0:
5001; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5002; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5003; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5004; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5005; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
5006; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
5007; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
5008; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5009; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5010; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
5011; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
5012; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
5013; AVX-NEXT:    retq
5014;
5015; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5016; AVX2:       # %bb.0:
5017; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5018; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5019; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
5020; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5021; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5022; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5023; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5024; AVX2-NEXT:    vzeroupper
5025; AVX2-NEXT:    retq
5026;
5027; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5028; AVX512F:       # %bb.0:
5029; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5030; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5031; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
5032; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5033; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5034; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5035; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5036; AVX512F-NEXT:    vzeroupper
5037; AVX512F-NEXT:    retq
5038;
5039; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5040; AVX512DQ:       # %bb.0:
5041; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
5042; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5043; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
5044; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5045; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5046; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
5047; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
5048; AVX512DQ-NEXT:    vzeroupper
5049; AVX512DQ-NEXT:    retq
5050;
5051; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5052; AVX512BW:       # %bb.0:
5053; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5054; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5055; AVX512BW-NEXT:    vpbroadcastb %xmm0, %zmm0
5056; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5057; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5058; AVX512BW-NEXT:    vzeroupper
5059; AVX512BW-NEXT:    retq
5060  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5061  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5062  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5063  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95, i32 0, i32 97, i32 0, i32 99, i32 0, i32 101, i32 0, i32 103, i32 0, i32 105, i32 0, i32 107, i32 0, i32 109, i32 0, i32 111, i32 0, i32 113, i32 0, i32 115, i32 0, i32 117, i32 0, i32 119, i32 0, i32 121, i32 0, i32 123, i32 0, i32 125, i32 0, i32 127>
5064  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5065  %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5066  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5067  ret void
5068}
5069
5070define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5071; SSE-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5072; SSE:       # %bb.0:
5073; SSE-NEXT:    movdqa (%rdi), %xmm0
5074; SSE-NEXT:    paddb (%rsi), %xmm0
5075; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5076; SSE-NEXT:    movdqa 16(%rdx), %xmm1
5077; SSE-NEXT:    paddb %xmm0, %xmm1
5078; SSE-NEXT:    movdqa (%rdx), %xmm2
5079; SSE-NEXT:    paddb %xmm0, %xmm2
5080; SSE-NEXT:    movdqa 48(%rdx), %xmm3
5081; SSE-NEXT:    paddb %xmm0, %xmm3
5082; SSE-NEXT:    paddb 32(%rdx), %xmm0
5083; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
5084; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
5085; SSE-NEXT:    movdqa %xmm2, (%rcx)
5086; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
5087; SSE-NEXT:    retq
5088;
5089; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5090; AVX:       # %bb.0:
5091; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5092; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5093; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5094; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
5095; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
5096; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
5097; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5098; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5099; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
5100; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
5101; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
5102; AVX-NEXT:    retq
5103;
5104; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5105; AVX2:       # %bb.0:
5106; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5107; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5108; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
5109; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5110; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5111; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5112; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5113; AVX2-NEXT:    vzeroupper
5114; AVX2-NEXT:    retq
5115;
5116; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5117; AVX512F:       # %bb.0:
5118; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5119; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5120; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
5121; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5122; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5123; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5124; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5125; AVX512F-NEXT:    vzeroupper
5126; AVX512F-NEXT:    retq
5127;
5128; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5129; AVX512DQ:       # %bb.0:
5130; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
5131; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5132; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
5133; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5134; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5135; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
5136; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
5137; AVX512DQ-NEXT:    vzeroupper
5138; AVX512DQ-NEXT:    retq
5139;
5140; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5141; AVX512BW:       # %bb.0:
5142; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5143; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5144; AVX512BW-NEXT:    vpbroadcastb %xmm0, %zmm0
5145; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5146; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5147; AVX512BW-NEXT:    vzeroupper
5148; AVX512BW-NEXT:    retq
5149  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5150  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5151  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5152  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 0, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 0, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 0, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 0, i32 125, i32 126, i32 127>
5153  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5154  %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5155  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5156  ret void
5157}
5158
5159define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5160; SSE-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5161; SSE:       # %bb.0:
5162; SSE-NEXT:    movdqa (%rdi), %xmm0
5163; SSE-NEXT:    paddb (%rsi), %xmm0
5164; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5165; SSE-NEXT:    movdqa 16(%rdx), %xmm1
5166; SSE-NEXT:    paddb %xmm0, %xmm1
5167; SSE-NEXT:    movdqa (%rdx), %xmm2
5168; SSE-NEXT:    paddb %xmm0, %xmm2
5169; SSE-NEXT:    movdqa 48(%rdx), %xmm3
5170; SSE-NEXT:    paddb %xmm0, %xmm3
5171; SSE-NEXT:    paddb 32(%rdx), %xmm0
5172; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
5173; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
5174; SSE-NEXT:    movdqa %xmm2, (%rcx)
5175; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
5176; SSE-NEXT:    retq
5177;
5178; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5179; AVX:       # %bb.0:
5180; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5181; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5182; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5183; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
5184; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
5185; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
5186; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5187; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5188; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
5189; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
5190; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
5191; AVX-NEXT:    retq
5192;
5193; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5194; AVX2:       # %bb.0:
5195; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5196; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5197; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
5198; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5199; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5200; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5201; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5202; AVX2-NEXT:    vzeroupper
5203; AVX2-NEXT:    retq
5204;
5205; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5206; AVX512F:       # %bb.0:
5207; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5208; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5209; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
5210; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5211; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5212; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5213; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5214; AVX512F-NEXT:    vzeroupper
5215; AVX512F-NEXT:    retq
5216;
5217; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5218; AVX512DQ:       # %bb.0:
5219; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
5220; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5221; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
5222; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5223; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5224; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
5225; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
5226; AVX512DQ-NEXT:    vzeroupper
5227; AVX512DQ-NEXT:    retq
5228;
5229; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5230; AVX512BW:       # %bb.0:
5231; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5232; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5233; AVX512BW-NEXT:    vpbroadcastb %xmm0, %zmm0
5234; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5235; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5236; AVX512BW-NEXT:    vzeroupper
5237; AVX512BW-NEXT:    retq
5238  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5239  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5240  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5241  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5242  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5243  %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5244  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5245  ret void
5246}
5247
5248define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5249; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5250; SSE:       # %bb.0:
5251; SSE-NEXT:    movdqa (%rdi), %xmm0
5252; SSE-NEXT:    paddb (%rsi), %xmm0
5253; SSE-NEXT:    movdqa 16(%rdx), %xmm1
5254; SSE-NEXT:    paddb %xmm0, %xmm1
5255; SSE-NEXT:    movdqa (%rdx), %xmm2
5256; SSE-NEXT:    paddb %xmm0, %xmm2
5257; SSE-NEXT:    movdqa 48(%rdx), %xmm3
5258; SSE-NEXT:    paddb %xmm0, %xmm3
5259; SSE-NEXT:    paddb 32(%rdx), %xmm0
5260; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
5261; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
5262; SSE-NEXT:    movdqa %xmm2, (%rcx)
5263; SSE-NEXT:    movdqa %xmm1, 16(%rcx)
5264; SSE-NEXT:    retq
5265;
5266; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5267; AVX:       # %bb.0:
5268; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5269; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5270; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm1
5271; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
5272; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm3
5273; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5274; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5275; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
5276; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
5277; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
5278; AVX-NEXT:    retq
5279;
5280; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5281; AVX2:       # %bb.0:
5282; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
5283; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5284; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
5285; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5286; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5287; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5288; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5289; AVX2-NEXT:    vzeroupper
5290; AVX2-NEXT:    retq
5291;
5292; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5293; AVX512F:       # %bb.0:
5294; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
5295; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5296; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
5297; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5298; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5299; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5300; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5301; AVX512F-NEXT:    vzeroupper
5302; AVX512F-NEXT:    retq
5303;
5304; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5305; AVX512DQ:       # %bb.0:
5306; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
5307; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5308; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
5309; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5310; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5311; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
5312; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
5313; AVX512DQ-NEXT:    vzeroupper
5314; AVX512DQ-NEXT:    retq
5315;
5316; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5317; AVX512BW:       # %bb.0:
5318; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5319; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5320; AVX512BW-NEXT:    vpbroadcastb %xmm0, %zmm0
5321; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5322; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5323; AVX512BW-NEXT:    vzeroupper
5324; AVX512BW-NEXT:    retq
5325  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5326  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5327  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5328  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5329  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5330  %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5331  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5332  ret void
5333}
5334
5335define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5336; SSE-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5337; SSE:       # %bb.0:
5338; SSE-NEXT:    movdqa (%rdi), %xmm0
5339; SSE-NEXT:    movdqa 16(%rdi), %xmm1
5340; SSE-NEXT:    paddb (%rsi), %xmm0
5341; SSE-NEXT:    paddb 16(%rsi), %xmm1
5342; SSE-NEXT:    movdqa 16(%rdx), %xmm2
5343; SSE-NEXT:    paddb %xmm1, %xmm2
5344; SSE-NEXT:    movdqa (%rdx), %xmm3
5345; SSE-NEXT:    paddb %xmm0, %xmm3
5346; SSE-NEXT:    paddb 48(%rdx), %xmm1
5347; SSE-NEXT:    paddb 32(%rdx), %xmm0
5348; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
5349; SSE-NEXT:    movdqa %xmm1, 48(%rcx)
5350; SSE-NEXT:    movdqa %xmm3, (%rcx)
5351; SSE-NEXT:    movdqa %xmm2, 16(%rcx)
5352; SSE-NEXT:    retq
5353;
5354; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5355; AVX:       # %bb.0:
5356; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5357; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
5358; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5359; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
5360; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm2
5361; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
5362; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
5363; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5364; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5365; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
5366; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
5367; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
5368; AVX-NEXT:    retq
5369;
5370; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5371; AVX2:       # %bb.0:
5372; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5373; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5374; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5375; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5376; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5377; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5378; AVX2-NEXT:    vzeroupper
5379; AVX2-NEXT:    retq
5380;
5381; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5382; AVX512F:       # %bb.0:
5383; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5384; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5385; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5386; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5387; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5388; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5389; AVX512F-NEXT:    vzeroupper
5390; AVX512F-NEXT:    retq
5391;
5392; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5393; AVX512DQ:       # %bb.0:
5394; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
5395; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5396; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5397; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5398; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
5399; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
5400; AVX512DQ-NEXT:    vzeroupper
5401; AVX512DQ-NEXT:    retq
5402;
5403; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5404; AVX512BW:       # %bb.0:
5405; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
5406; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5407; AVX512BW-NEXT:    vpbroadcastb %xmm0, %zmm0
5408; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5409; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5410; AVX512BW-NEXT:    vzeroupper
5411; AVX512BW-NEXT:    retq
5412  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5413  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5414  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5415  %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5416  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5417  %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5418  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5419  ret void
5420}
5421
5422; FIXME: all these crash during selection:
5423; define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5424;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5425;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5426;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5427;   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
5428;   %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
5429;   %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
5430;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5431;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5432;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5433;   ret void
5434; }
5435;
5436; define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5437;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5438;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5439;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5440;   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
5441;   %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
5442;   %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
5443;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5444;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5445;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5446;   ret void
5447; }
5448;
5449; define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5450;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5451;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5452;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5453;   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
5454;   %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5455;   %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
5456;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5457;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5458;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5459;   ret void
5460; }
5461;
5462; define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5463;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5464;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5465;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5466;   %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
5467;   %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5468;   %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
5469;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5470;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5471;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5472;   ret void
5473; }
5474;
5475; define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5476;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5477;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5478;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5479;   %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
5480;   %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
5481;   %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8>
5482;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5483;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5484;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5485;   ret void
5486; }
5487;
5488; define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5489;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5490;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5491;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5492;   %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
5493;   %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
5494;   %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8>
5495;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5496;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5497;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5498;   ret void
5499; }
5500;
5501; define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5502;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5503;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5504;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5505;   %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
5506;   %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
5507;   %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8>
5508;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5509;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5510;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5511;   ret void
5512; }
5513;
5514; define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5515;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5516;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5517;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5518;   %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
5519;   %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
5520;   %out.bytevec = bitcast <8 x i64> %broadcast.of.aextinreg to <64 x i8>
5521;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5522;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5523;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5524;   ret void
5525; }
5526;
5527; define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5528;   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5529;   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5530;   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5531;   %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
5532;   %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
5533;   %out.bytevec = bitcast <8 x i64> %broadcast.of.aextinreg to <64 x i8>
5534;   %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5535;   %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5536;   store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5537;   ret void
5538; }
5539
5540define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5541; SSE-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
5542; SSE:       # %bb.0:
5543; SSE-NEXT:    movdqa (%rdi), %xmm0
5544; SSE-NEXT:    movdqa 16(%rdi), %xmm1
5545; SSE-NEXT:    paddb (%rsi), %xmm0
5546; SSE-NEXT:    paddb 16(%rsi), %xmm1
5547; SSE-NEXT:    movdqa 16(%rdx), %xmm2
5548; SSE-NEXT:    paddb %xmm1, %xmm2
5549; SSE-NEXT:    movdqa (%rdx), %xmm3
5550; SSE-NEXT:    paddb %xmm0, %xmm3
5551; SSE-NEXT:    paddb 48(%rdx), %xmm1
5552; SSE-NEXT:    paddb 32(%rdx), %xmm0
5553; SSE-NEXT:    movdqa %xmm0, 32(%rcx)
5554; SSE-NEXT:    movdqa %xmm1, 48(%rcx)
5555; SSE-NEXT:    movdqa %xmm3, (%rcx)
5556; SSE-NEXT:    movdqa %xmm2, 16(%rcx)
5557; SSE-NEXT:    retq
5558;
5559; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
5560; AVX:       # %bb.0:
5561; AVX-NEXT:    vmovdqa (%rdi), %xmm0
5562; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
5563; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
5564; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
5565; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm2
5566; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm3
5567; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
5568; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
5569; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
5570; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
5571; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
5572; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
5573; AVX-NEXT:    retq
5574;
5575; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
5576; AVX2:       # %bb.0:
5577; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5578; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5579; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5580; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5581; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5582; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5583; AVX2-NEXT:    vzeroupper
5584; AVX2-NEXT:    retq
5585;
5586; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
5587; AVX512F:       # %bb.0:
5588; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5589; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5590; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5591; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5592; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
5593; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
5594; AVX512F-NEXT:    vzeroupper
5595; AVX512F-NEXT:    retq
5596;
5597; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
5598; AVX512DQ:       # %bb.0:
5599; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
5600; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
5601; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
5602; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
5603; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
5604; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rcx)
5605; AVX512DQ-NEXT:    vzeroupper
5606; AVX512DQ-NEXT:    retq
5607;
5608; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
5609; AVX512BW:       # %bb.0:
5610; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
5611; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
5612; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5613; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
5614; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
5615; AVX512BW-NEXT:    vzeroupper
5616; AVX512BW-NEXT:    retq
5617  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5618  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5619  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5620  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
5621  %broadcast.of.aextinreg = shufflevector <4 x i128> %in.vec.cast, <4 x i128> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
5622  %out.bytevec = bitcast <4 x i128> %broadcast.of.aextinreg to <64 x i8>
5623  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5624  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
5625  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5626  ret void
5627}
5628;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
5629; AVX1-ONLY: {{.*}}
5630; FALLBACK0: {{.*}}
5631; FALLBACK1: {{.*}}
5632; FALLBACK10: {{.*}}
5633; FALLBACK11: {{.*}}
5634; FALLBACK12: {{.*}}
5635; FALLBACK13: {{.*}}
5636; FALLBACK2: {{.*}}
5637; FALLBACK3: {{.*}}
5638; FALLBACK4: {{.*}}
5639; FALLBACK5: {{.*}}
5640; FALLBACK6: {{.*}}
5641; FALLBACK7: {{.*}}
5642; FALLBACK8: {{.*}}
5643; FALLBACK9: {{.*}}
5644