; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI

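; VPDPBUSD multiplies each group of four unsigned bytes of one source with the
; corresponding four signed bytes of the other, sign-extends the 16-bit
; products to 32 bits, and accumulates their sum into the matching dword lane:
;   dst[i] += u8(s1[4i+0])*s8(s2[4i+0]) + ... + u8(s1[4i+3])*s8(s2[4i+3])
; The tests below exercise which zext/sext multiply-reduce patterns are (and
; are not) combined into this instruction.

; Both inputs are zero-extended, so the unsigned-times-signed VPDPBUSD pattern
; does not apply and codegen falls back to vpmaddwd plus a shuffle reduction.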
define i32 @no_dpbusd(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: no_dpbusd:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: no_dpbusd:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

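; The operands are swapped relative to the usual order (%a is sign-extended,
; %b zero-extended), so the combine has to commute them; all three
; configurations still form vpdpbusd.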
define i32 @vpdpbusd_mutate(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_mutate:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rdi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_mutate:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_mutate:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rdi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = sext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

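; The multiply is performed in i16 and the result zero-extended to i32;
; vpdpbusd sign-extends its 16-bit products, so this must not be combined
; (see the comment in the body).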
define i32 @mul_zext(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_zext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_zext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i16>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i16>
  %4 = mul nsw <16 x i16> %3, %1
  ; We can't combine to vpdpbusd for zext, because each of the 4 multiplies
  ; done by vpdpbusd computes a signed 16-bit product that will be
  ; sign-extended before being added into the accumulator.
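  ; For example, a product of -1 is 0xFFFF as i16: zext would contribute
  ; 65535 to the reduction, whereas vpdpbusd would add -1.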
  %5 = zext <16 x i16> %4 to <16 x i32>
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %op.extra = add nsw i32 %6, %c
  ret i32 %op.extra
}

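; Here the i16 products are sign-extended, which matches what vpdpbusd
; computes, but the combine is still not performed; the TODO in the body
; records the operand-width check such a combine would need.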
define i32 @mul_sext(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_sext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_sext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i16>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i16>
  %4 = mul nsw <16 x i16> %3, %1
  ; TODO:
  ; We also need to verify that the multiply has at least 2x the number of bits
  ; of the input. We shouldn't match
  ; (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
  %5 = sext <16 x i16> %4 to <16 x i32>
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %op.extra = add nsw i32 %6, %c
  ret i32 %op.extra
}

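; The unsigned-times-signed pattern on 16-byte inputs: all three
; configurations form vpdpbusd. Plain AVX512VNNI only has the zmm form of the
; instruction, hence the widened register operands.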
define i32 @vpdpbusd_512(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_512:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_512:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_512:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

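; 8-byte inputs: vmovq already zeroes the upper lanes, so vpdpbusd applies
; directly and only two dword lanes of the result need to be reduced.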
define i32 @vpdpbusd_256(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_256:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm0, %xmm1, %xmm2
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_256:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_256:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <8 x i8>, ptr %a, align 8
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b, align 8
  %3 = sext <8 x i8> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

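; 4-byte inputs: only one dword group is valid, so the upper lanes are zeroed
; with vpblendw before the dot product and no shuffle reduction is needed.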
define i32 @vpdpbusd_128(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_128:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_128:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_128:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <4 x i8>, ptr %a, align 8
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %b, align 8
  %3 = sext <4 x i8> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

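; 2-byte inputs: only the low two bytes may contribute, so they are isolated
; (vpblendw, or a vpandq mask without AVX512VL) before the dot product.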
define i32 @vpdpbusd_2xi32(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_2xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm2, %zmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_2xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <2 x i8>, ptr %a, align 8
  %1 = zext <2 x i8> %0 to <2 x i32>
  %2 = load <2 x i8>, ptr %b, align 8
  %3 = sext <2 x i8> %2 to <2 x i32>
  %4 = mul nsw <2 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)

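; 32-byte inputs: a single 256-bit vpdpbusd with AVXVNNI or AVX512VL; plain
; AVX512VNNI widens the operands to zmm.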
define i32 @vpdpbusd_32xi32(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_32xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm1
; AVXVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_32xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VNNI-NEXT:    vmovdqu (%rsi), %ymm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vextracti128 $1, %ymm2, %xmm0
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_32xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %ymm0, %ymm1
; AVX512VLVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    vzeroupper
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <32 x i8>, ptr %a, align 16
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = load <32 x i8>, ptr %b, align 16
  %3 = sext <32 x i8> %2 to <32 x i32>
  %4 = mul nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

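; 64-byte inputs: AVXVNNI needs two 256-bit vpdpbusd ops whose results are
; added together, while AVX512 uses a single 512-bit vpdpbusd.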
define i32 @vpdpbusd_64xi32(ptr %a, ptr %b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_64xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVXVNNI-NEXT:    {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm2
; AVXVNNI-NEXT:    vpaddd %ymm3, %ymm2, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: vpdpbusd_64xi32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpdpbusd (%rsi), %zmm0, %zmm1
; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <64 x i8>, ptr %a, align 16
  %1 = zext <64 x i8> %0 to <64 x i32>
  %2 = load <64 x i8>, ptr %b, align 16
  %3 = sext <64 x i8> %2 to <64 x i32>
  %4 = mul nsw <64 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)