xref: /llvm-project/llvm/test/CodeGen/X86/avxvnni.ll (revision 2f448bf509432c1a19ec46ab8cbc7353c03c6280)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefix=AVX
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
4; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avx512bw,+avxvnni | FileCheck %s --check-prefix=AVX
5
; 128-bit base case: pmaddwd intrinsic whose result is added to accumulator %a0
; (accumulator on the RHS of the add) must combine into a single dot-product
; accumulate instruction — {vex}-encoded under +avxvnni, EVEX under AVX512-VNNI.
6define <4 x i32> @test_pmaddwd_v8i16_add_v4i32(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
7; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32:
8; AVX:       # %bb.0:
9; AVX-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0
10; AVX-NEXT:    retq
11;
12; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32:
13; AVX512:       # %bb.0:
14; AVX512-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
15; AVX512-NEXT:    retq
16  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
17  %2 = add <4 x i32> %1, %a0
18  ret <4 x i32> %2
19}
20
; Same as the base case but with the add operands commuted (%a0 + %1); the
; combine must recognize both operand orders and still emit one fused instruction.
21define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
22; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_commute:
23; AVX:       # %bb.0:
24; AVX-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0
25; AVX-NEXT:    retq
26;
27; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_commute:
28; AVX512:       # %bb.0:
29; AVX512-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
30; AVX512-NEXT:    retq
31  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
32  %2 = add <4 x i32> %a0, %1
33  ret <4 x i32> %2
34}
35
; First multiplicand comes from memory; the load must be folded into the fused
; instruction's memory operand ((%rdi)) rather than emitted as a separate load.
36define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load1(<4 x i32> %a0, ptr %p1, <8 x i16> %a2) {
37; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_load1:
38; AVX:       # %bb.0:
39; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
40; AVX-NEXT:    retq
41;
42; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_load1:
43; AVX512:       # %bb.0:
44; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
45; AVX512-NEXT:    retq
46  %a1 = load <8 x i16>, ptr %p1
47  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
48  %2 = add <4 x i32> %1, %a0
49  ret <4 x i32> %2
50}
51
; Second multiplicand comes from memory; since the multiply is commutative the
; loaded operand can still be folded into the memory-operand slot.
52define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load2(<4 x i32> %a0, <8 x i16> %a1, ptr %p2) {
53; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_load2:
54; AVX:       # %bb.0:
55; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
56; AVX-NEXT:    retq
57;
58; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_load2:
59; AVX512:       # %bb.0:
60; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
61; AVX512-NEXT:    retq
62  %a2 = load <8 x i16>, ptr %p2
63  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
64  %2 = add <4 x i32> %1, %a0
65  ret <4 x i32> %2
66}
67
; Combination of the commuted add (%a0 + %1) with a load of the first
; multiplicand; both the fold and the load-folding must still fire together.
68define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load1(<4 x i32> %a0, ptr %p1, <8 x i16> %a2) {
69; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load1:
70; AVX:       # %bb.0:
71; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
72; AVX-NEXT:    retq
73;
74; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load1:
75; AVX512:       # %bb.0:
76; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
77; AVX512-NEXT:    retq
78  %a1 = load <8 x i16>, ptr %p1
79  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
80  %2 = add <4 x i32> %a0, %1
81  ret <4 x i32> %2
82}
83
; Combination of the commuted add (%a0 + %1) with a load of the second
; multiplicand; the loaded operand is still folded into the memory slot.
84define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load2(<4 x i32> %a0, <8 x i16> %a1, ptr %p2) {
85; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load2:
86; AVX:       # %bb.0:
87; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
88; AVX-NEXT:    retq
89;
90; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load2:
91; AVX512:       # %bb.0:
92; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
93; AVX512-NEXT:    retq
94  %a2 = load <8 x i16>, ptr %p2
95  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
96  %2 = add <4 x i32> %a0, %1
97  ret <4 x i32> %2
98}
99
; 256-bit base case: the avx2 pmaddwd intrinsic plus accumulator add must fuse
; into a single ymm dot-product accumulate instruction.
100define <8 x i32> @test_pmaddwd_v16i16_add_v8i32(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
101; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32:
102; AVX:       # %bb.0:
103; AVX-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0
104; AVX-NEXT:    retq
105;
106; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32:
107; AVX512:       # %bb.0:
108; AVX512-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
109; AVX512-NEXT:    retq
110  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
111  %2 = add <8 x i32> %1, %a0
112  ret <8 x i32> %2
113}
114
; 256-bit variant with the add operands commuted (%a0 + %1); same single fused
; ymm instruction is expected for both operand orders.
115define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
116; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_commute:
117; AVX:       # %bb.0:
118; AVX-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0
119; AVX-NEXT:    retq
120;
121; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_commute:
122; AVX512:       # %bb.0:
123; AVX512-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
124; AVX512-NEXT:    retq
125  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
126  %2 = add <8 x i32> %a0, %1
127  ret <8 x i32> %2
128}
129
; 256-bit variant with the first multiplicand loaded from memory; the load must
; fold into the fused instruction's memory operand ((%rdi)).
130define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load1(<8 x i32> %a0, ptr %p1, <16 x i16> %a2) {
131; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_load1:
132; AVX:       # %bb.0:
133; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
134; AVX-NEXT:    retq
135;
136; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_load1:
137; AVX512:       # %bb.0:
138; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
139; AVX512-NEXT:    retq
140  %a1 = load <16 x i16>, ptr %p1
141  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
142  %2 = add <8 x i32> %1, %a0
143  ret <8 x i32> %2
144}
145
; 256-bit variant with the second multiplicand loaded from memory; commutativity
; of the multiply lets the loaded operand take the memory slot.
146define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load2(<8 x i32> %a0, <16 x i16> %a1, ptr %p2) {
147; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_load2:
148; AVX:       # %bb.0:
149; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
150; AVX-NEXT:    retq
151;
152; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_load2:
153; AVX512:       # %bb.0:
154; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
155; AVX512-NEXT:    retq
156  %a2 = load <16 x i16>, ptr %p2
157  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
158  %2 = add <8 x i32> %1, %a0
159  ret <8 x i32> %2
160}
161
; 256-bit combination of the commuted add (%a0 + %1) with a load of the first
; multiplicand; both transforms must still compose into one instruction.
162define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load1(<8 x i32> %a0, ptr %p1, <16 x i16> %a2) {
163; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load1:
164; AVX:       # %bb.0:
165; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
166; AVX-NEXT:    retq
167;
168; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load1:
169; AVX512:       # %bb.0:
170; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
171; AVX512-NEXT:    retq
172  %a1 = load <16 x i16>, ptr %p1
173  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
174  %2 = add <8 x i32> %a0, %1
175  ret <8 x i32> %2
176}
177
; 256-bit combination of the commuted add (%a0 + %1) with a load of the second
; multiplicand; the loaded operand is still folded into the memory slot.
178define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load2(<8 x i32> %a0, <16 x i16> %a1, ptr %p2) {
179; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load2:
180; AVX:       # %bb.0:
181; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
182; AVX-NEXT:    retq
183;
184; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load2:
185; AVX512:       # %bb.0:
186; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
187; AVX512-NEXT:    retq
188  %a2 = load <16 x i16>, ptr %p2
189  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
190  %2 = add <8 x i32> %a0, %1
191  ret <8 x i32> %2
192}
193
194declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
195declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
196