xref: /llvm-project/llvm/test/CodeGen/X86/avx512bwvl-arith.ll (revision 2951dba98beb97a73da3443dcdb2eb09069e1aca)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX512
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX256
4
; 256-bit vector operations (ymm registers)
6
; Unmasked byte-wise add of two <32 x i8> register operands; the assertions
; below expect a single vpaddb on ymm registers.
define <32 x i8> @vpaddb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpaddb256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %sum = add <32 x i8> %i, %j
  ret <32 x i8> %sum
}
15
; Byte-wise add where the second operand is loaded from memory (align 4); the
; assertions below expect the load to be folded into vpaddb's memory operand.
define <32 x i8> @vpaddb256_fold_test(<32 x i8> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddb256_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %rhs = load <32 x i8>, ptr %j, align 4
  %sum = add <32 x i8> %i, %rhs
  ret <32 x i8> %sum
}
25
; Unmasked word-wise add of two <16 x i16> register operands; the assertions
; below expect a single vpaddw on ymm registers.
define <16 x i16> @vpaddw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpaddw256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %sum = add <16 x i16> %i, %j
  ret <16 x i16> %sum
}
34
; Word-wise add where the second operand is loaded from memory (align 4); the
; assertions below expect the load to be folded into vpaddw's memory operand.
define <16 x i16> @vpaddw256_fold_test(<16 x i16> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddw256_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %rhs = load <16 x i16>, ptr %j, align 4
  %sum = add <16 x i16> %i, %rhs
  ret <16 x i16> %sum
}
44
; Merge-masked word add: lanes where %mask1 is non-zero receive %i + %j, the
; remaining lanes keep %i. The assertions below expect vptestmw to build the
; mask and a {%k1}-predicated vpaddw.
define <16 x i16> @vpaddw256_mask_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw256_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %lanes = icmp ne <16 x i16> %mask1, zeroinitializer
  %sum = add <16 x i16> %i, %j
  %res = select <16 x i1> %lanes, <16 x i16> %sum, <16 x i16> %i
  ret <16 x i16> %res
}
56
; Zero-masked word add: lanes where %mask1 is non-zero receive %i + %j, the
; remaining lanes are zero. The assertions below expect vptestmw to build the
; mask and a {%k1} {z}-predicated vpaddw.
define <16 x i16> @vpaddw256_maskz_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw256_maskz_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %lanes = icmp ne <16 x i16> %mask1, zeroinitializer
  %sum = add <16 x i16> %i, %j
  %res = select <16 x i1> %lanes, <16 x i16> %sum, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
68
; Merge-masked word add with the second operand loaded from %j.ptr: lanes where
; %mask1 is non-zero receive %i + load, the remaining lanes keep %i. The
; assertions below expect the load to be folded into a {%k1}-predicated vpaddw.
;
; The function reads memory through %j.ptr, so it is declared with `nounwind`
; only (like the unmasked fold tests); `readnone` would contradict the load.
define <16 x i16> @vpaddw256_mask_fold_test(<16 x i16> %i, ptr %j.ptr, <16 x i16> %mask1) nounwind {
; CHECK-LABEL: vpaddw256_mask_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
  %j = load <16 x i16>, ptr %j.ptr
  %x = add <16 x i16> %i, %j
  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i
  ret <16 x i16> %r
}
81
; Zero-masked word add with the second operand loaded from %j.ptr: lanes where
; %mask1 is non-zero receive %i + load, the remaining lanes are zero. The
; assertions below expect the load to be folded into a {%k1} {z}-predicated
; vpaddw.
;
; The function reads memory through %j.ptr, so it is declared with `nounwind`
; only (like the unmasked fold tests); `readnone` would contradict the load.
define <16 x i16> @vpaddw256_maskz_fold_test(<16 x i16> %i, ptr %j.ptr, <16 x i16> %mask1) nounwind {
; CHECK-LABEL: vpaddw256_maskz_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
  %j = load <16 x i16>, ptr %j.ptr
  %x = add <16 x i16> %i, %j
  %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
  ret <16 x i16> %r
}
94
; Unmasked byte-wise subtract (%i - %j) on <32 x i8>; the assertions below
; expect a single vpsubb on ymm registers.
define <32 x i8> @vpsubb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpsubb256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %diff = sub <32 x i8> %i, %j
  ret <32 x i8> %diff
}
103
; Unmasked word-wise subtract (%i - %j) on <16 x i16>; the assertions below
; expect a single vpsubw on ymm registers.
define <16 x i16> @vpsubw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpsubw256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %diff = sub <16 x i16> %i, %j
  ret <16 x i16> %diff
}
112
; Word-wise multiply on <16 x i16>; the assertions below expect a single
; vpmullw on ymm registers.
define <16 x i16> @vpmullw256_test(<16 x i16> %i, <16 x i16> %j) {
; CHECK-LABEL: vpmullw256_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %prod = mul <16 x i16> %i, %j
  ret <16 x i16> %prod
}
121
; 128-bit vector operations (xmm registers)
123
; Unmasked byte-wise add of two <16 x i8> register operands; the assertions
; below expect a single vpaddb on xmm registers.
define <16 x i8> @vpaddb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpaddb128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %sum = add <16 x i8> %i, %j
  ret <16 x i8> %sum
}
132
; Byte-wise add where the second operand is loaded from memory (align 4); the
; assertions below expect the load to be folded into vpaddb's memory operand.
define <16 x i8> @vpaddb128_fold_test(<16 x i8> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddb128_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %rhs = load <16 x i8>, ptr %j, align 4
  %sum = add <16 x i8> %i, %rhs
  ret <16 x i8> %sum
}
142
; Unmasked word-wise add of two <8 x i16> register operands; the assertions
; below expect a single vpaddw on xmm registers.
define <8 x i16> @vpaddw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpaddw128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %sum = add <8 x i16> %i, %j
  ret <8 x i16> %sum
}
151
; Word-wise add where the second operand is loaded from memory (align 4); the
; assertions below expect the load to be folded into vpaddw's memory operand.
define <8 x i16> @vpaddw128_fold_test(<8 x i16> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddw128_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %rhs = load <8 x i16>, ptr %j, align 4
  %sum = add <8 x i16> %i, %rhs
  ret <8 x i16> %sum
}
161
; Merge-masked word add: lanes where %mask1 is non-zero receive %i + %j, the
; remaining lanes keep %i. The assertions below expect vptestmw to build the
; mask and a {%k1}-predicated vpaddw.
define <8 x i16> @vpaddw128_mask_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw128_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %lanes = icmp ne <8 x i16> %mask1, zeroinitializer
  %sum = add <8 x i16> %i, %j
  %res = select <8 x i1> %lanes, <8 x i16> %sum, <8 x i16> %i
  ret <8 x i16> %res
}
173
; Zero-masked word add: lanes where %mask1 is non-zero receive %i + %j, the
; remaining lanes are zero. The assertions below expect vptestmw to build the
; mask and a {%k1} {z}-predicated vpaddw.
define <8 x i16> @vpaddw128_maskz_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddw128_maskz_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %lanes = icmp ne <8 x i16> %mask1, zeroinitializer
  %sum = add <8 x i16> %i, %j
  %res = select <8 x i1> %lanes, <8 x i16> %sum, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
185
; Merge-masked word add with the second operand loaded from %j.ptr: lanes where
; %mask1 is non-zero receive %i + load, the remaining lanes keep %i. The
; assertions below expect the load to be folded into a {%k1}-predicated vpaddw.
;
; The function reads memory through %j.ptr, so it is declared with `nounwind`
; only (like the unmasked fold tests); `readnone` would contradict the load.
define <8 x i16> @vpaddw128_mask_fold_test(<8 x i16> %i, ptr %j.ptr, <8 x i16> %mask1) nounwind {
; CHECK-LABEL: vpaddw128_mask_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
  %j = load <8 x i16>, ptr %j.ptr
  %x = add <8 x i16> %i, %j
  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i
  ret <8 x i16> %r
}
198
; Zero-masked word add with the second operand loaded from %j.ptr: lanes where
; %mask1 is non-zero receive %i + load, the remaining lanes are zero. The
; assertions below expect the load to be folded into a {%k1} {z}-predicated
; vpaddw.
;
; The function reads memory through %j.ptr, so it is declared with `nounwind`
; only (like the unmasked fold tests); `readnone` would contradict the load.
define <8 x i16> @vpaddw128_maskz_fold_test(<8 x i16> %i, ptr %j.ptr, <8 x i16> %mask1) nounwind {
; CHECK-LABEL: vpaddw128_maskz_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
  %j = load <8 x i16>, ptr %j.ptr
  %x = add <8 x i16> %i, %j
  %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
  ret <8 x i16> %r
}
211
; Unmasked byte-wise subtract (%i - %j) on <16 x i8>; the assertions below
; expect a single vpsubb on xmm registers.
define <16 x i8> @vpsubb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpsubb128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %diff = sub <16 x i8> %i, %j
  ret <16 x i8> %diff
}
220
; Unmasked word-wise subtract (%i - %j) on <8 x i16>; the assertions below
; expect a single vpsubw on xmm registers.
define <8 x i16> @vpsubw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpsubw128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %diff = sub <8 x i16> %i, %j
  ret <8 x i16> %diff
}
229
; Word-wise multiply on <8 x i16>; the assertions below expect a single
; vpmullw on xmm registers.
define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
; CHECK-LABEL: vpmullw128_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %prod = mul <8 x i16> %i, %j
  ret <8 x i16> %prod
}
238
; Keeps the low 12 lanes of the <16 x i1> argument, takes the top 4 lanes from
; an all-zero vector (shuffle indices 28-31 select the second operand), and
; returns the resulting mask bitcast to i16. Expected codegen differs between
; the two RUN configurations (zmm-based vs. ymm-based sequences).
define i16 @PR90356(<16 x i1> %a) {
; EVEX512-LABEL: PR90356:
; EVEX512:       # %bb.0:
; EVEX512-NEXT:    vpsllw $7, %xmm0, %xmm0
; EVEX512-NEXT:    vpmovb2m %xmm0, %k1
; EVEX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; EVEX512-NEXT:    movb $63, %al
; EVEX512-NEXT:    kmovd %eax, %k1
; EVEX512-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; EVEX512-NEXT:    vptestmd %zmm0, %zmm0, %k0
; EVEX512-NEXT:    kmovd %k0, %eax
; EVEX512-NEXT:    # kill: def $ax killed $ax killed $eax
; EVEX512-NEXT:    vzeroupper
; EVEX512-NEXT:    retq
;
; EVEX256-LABEL: PR90356:
; EVEX256:       # %bb.0:
; EVEX256-NEXT:    vpsllw $7, %xmm0, %xmm0
; EVEX256-NEXT:    vpmovb2m %xmm0, %k0
; EVEX256-NEXT:    vpmovm2w %k0, %ymm0
; EVEX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; EVEX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; EVEX256-NEXT:    vpmovw2m %ymm0, %k0
; EVEX256-NEXT:    kmovd %k0, %eax
; EVEX256-NEXT:    # kill: def $ax killed $ax killed $eax
; EVEX256-NEXT:    vzeroupper
; EVEX256-NEXT:    retq
  %low12 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
  %bits = bitcast <16 x i1> %low12 to i16
  ret i16 %bits
}
270