; xref: /llvm-project/llvm/test/CodeGen/X86/abds-vector-512.ll (revision da570ef1b4f856603970ecb14299947fb6cd678a)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ

;
; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b)
;

define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = sext <64 x i8> %a to <64 x i64>
  %bext = sext <64 x i8> %b to <64 x i64>
  %sub = sub <64 x i64> %aext, %bext
  %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 false)
  %trunc = trunc <64 x i64> %abs to <64 x i8>
  ret <64 x i8> %trunc
}

define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v64i8_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v64i8_undef:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = sext <64 x i8> %a to <64 x i64>
  %bext = sext <64 x i8> %b to <64 x i64>
  %sub = sub <64 x i64> %aext, %bext
  %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 true)
  %trunc = trunc <64 x i64> %abs to <64 x i8>
  ret <64 x i8> %trunc
}

define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = sext <32 x i16> %a to <32 x i64>
  %bext = sext <32 x i16> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false)
  %trunc = trunc <32 x i64> %abs to <32 x i16>
  ret <32 x i16> %trunc
}

define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v32i16_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v32i16_undef:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = sext <32 x i16> %a to <32 x i64>
  %bext = sext <32 x i16> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true)
  %trunc = trunc <32 x i64> %abs to <32 x i16>
  ret <32 x i16> %trunc
}

define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_ext_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = sext <16 x i32> %a to <16 x i64>
  %bext = sext <16 x i32> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i32>
  ret <16 x i32> %trunc
}

define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_ext_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = sext <16 x i32> %a to <16 x i64>
  %bext = sext <16 x i32> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i32>
  ret <16 x i32> %trunc
}

define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_ext_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = sext <8 x i64> %a to <8 x i128>
  %bext = sext <8 x i64> %b to <8 x i128>
  %sub = sub <8 x i128> %aext, %bext
  %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 false)
  %trunc = trunc <8 x i128> %abs to <8 x i64>
  ret <8 x i64> %trunc
}

define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_ext_v8i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = sext <8 x i64> %a to <8 x i128>
  %bext = sext <8 x i64> %b to <8 x i128>
  %sub = sub <8 x i128> %aext, %bext
  %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 true)
  %trunc = trunc <8 x i128> %abs to <8 x i64>
  ret <8 x i64> %trunc
}

;
; sub(smax(a,b),smin(a,b)) -> abds(a,b)
;

define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_minmax_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_minmax_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %min = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %a, <64 x i8> %b)
  %max = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %a, <64 x i8> %b)
  %sub = sub <64 x i8> %max, %min
  ret <64 x i8> %sub
}

define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_minmax_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_minmax_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %min = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %a, <32 x i16> %b)
  %max = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %a, <32 x i16> %b)
  %sub = sub <32 x i16> %max, %min
  ret <32 x i16> %sub
}

define <16 x i32> @abd_minmax_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_minmax_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %min = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %a, <16 x i32> %b)
  %max = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %a, <16 x i32> %b)
  %sub = sub <16 x i32> %max, %min
  ret <16 x i32> %sub
}

define <8 x i64> @abd_minmax_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_minmax_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %min = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %a, <8 x i64> %b)
  %max = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %a, <8 x i64> %b)
  %sub = sub <8 x i64> %max, %min
  ret <8 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abds(a,b)
;

define <64 x i8> @abd_cmp_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_cmp_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_cmp_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %cmp = icmp sgt <64 x i8> %a, %b
  %ab = sub <64 x i8> %a, %b
  %ba = sub <64 x i8> %b, %a
  %sel = select <64 x i1> %cmp, <64 x i8> %ab, <64 x i8> %ba
  ret <64 x i8> %sel
}

define <32 x i16> @abd_cmp_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_cmp_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_cmp_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %cmp = icmp sge <32 x i16> %a, %b
  %ab = sub <32 x i16> %a, %b
  %ba = sub <32 x i16> %b, %a
  %sel = select <32 x i1> %cmp, <32 x i16> %ab, <32 x i16> %ba
  ret <32 x i16> %sel
}

define <16 x i32> @abd_cmp_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_cmp_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %cmp = icmp slt <16 x i32> %a, %b
  %ab = sub <16 x i32> %a, %b
  %ba = sub <16 x i32> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i32> %ba, <16 x i32> %ab
  ret <16 x i32> %sel
}

define <8 x i64> @abd_cmp_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_cmp_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %cmp = icmp sge <8 x i64> %a, %b
  %ab = sub <8 x i64> %a, %b
  %ba = sub <8 x i64> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i64> %ab, <8 x i64> %ba
  ret <8 x i64> %sel
}


;
; abs(sub_nsw(x, y)) -> abds(a,b)
;

define <64 x i8> @abd_subnsw_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_subnsw_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpabsb %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_subnsw_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpabsb %ymm0, %ymm0
; AVX512DQ-NEXT:    vpabsb %ymm2, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %sub = sub nsw <64 x i8> %a, %b
  %abs = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %sub, i1 false)
  ret <64 x i8> %abs
}

define <32 x i16> @abd_subnsw_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_subnsw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpabsw %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_subnsw_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpabsw %ymm0, %ymm0
; AVX512DQ-NEXT:    vpabsw %ymm2, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %sub = sub nsw <32 x i16> %a, %b
  %abs = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %sub, i1 false)
  ret <32 x i16> %abs
}

define <16 x i32> @abd_subnsw_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_subnsw_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpabsd %zmm0, %zmm0
; AVX512-NEXT:    retq
  %sub = sub nsw <16 x i32> %a, %b
  %abs = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %sub, i1 false)
  ret <16 x i32> %abs
}

define <8 x i64> @abd_subnsw_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_subnsw_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpabsq %zmm0, %zmm0
; AVX512-NEXT:    retq
  %sub = sub nsw <8 x i64> %a, %b
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false)
  ret <8 x i64> %abs
}

declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
declare <64 x i64> @llvm.abs.v64i64(<64 x i64>, i1)
declare <8 x i128> @llvm.abs.v8i128(<8 x i128>, i1)

declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>)

declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)