; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512

;
; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b)
;
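; A scalar walk-through with assumed example values (illustrative only, not
; checked by the RUN lines): for i8 a = 100, b = -100, sext to i64 gives 100
; and -100, the sub gives 200, abs keeps 200, and trunc back to i8 yields
; 0xC8, exactly what abds(a,b) computes, so the widened sequence folds to the
; abds lowering checked below.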

define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_ext_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <32 x i8> %a to <32 x i64>
  %bext = sext <32 x i8> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false)
  %trunc = trunc <32 x i64> %abs to <32 x i8>
  ret <32 x i8> %trunc
}

define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_ext_v32i8_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v32i8_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v32i8_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <32 x i8> %a to <32 x i64>
  %bext = sext <32 x i8> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true)
  %trunc = trunc <32 x i64> %abs to <32 x i8>
  ret <32 x i8> %trunc
}

define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_ext_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <16 x i16> %a to <16 x i64>
  %bext = sext <16 x i16> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i16>
  ret <16 x i16> %trunc
}

define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_ext_v16i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v16i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v16i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <16 x i16> %a to <16 x i64>
  %bext = sext <16 x i16> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i16>
  ret <16 x i16> %trunc
}

define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_ext_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <8 x i32> %a to <8 x i64>
  %bext = sext <8 x i32> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false)
  %trunc = trunc <8 x i64> %abs to <8 x i32>
  ret <8 x i32> %trunc
}

define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_ext_v8i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v8i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v8i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <8 x i32> %a to <8 x i64>
  %bext = sext <8 x i32> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true)
  %trunc = trunc <8 x i64> %abs to <8 x i32>
  ret <8 x i32> %trunc
}

define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <4 x i64> %a to <4 x i128>
  %bext = sext <4 x i64> %b to <4 x i128>
  %sub = sub <4 x i128> %aext, %bext
  %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 false)
  %trunc = trunc <4 x i128> %abs to <4 x i64>
  ret <4 x i64> %trunc
}

define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v4i64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = sext <4 x i64> %a to <4 x i128>
  %bext = sext <4 x i64> %b to <4 x i128>
  %sub = sub <4 x i128> %aext, %bext
  %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 true)
  %trunc = trunc <4 x i128> %abs to <4 x i64>
  ret <4 x i64> %trunc
}

;
; sub(smax(a,b),smin(a,b)) -> abds(a,b)
;
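; Scalar sketch (illustrative values only): for i8 a = 3, b = -5, smax(a,b) = 3
; and smin(a,b) = -5, so smax - smin = 8 = |a - b|; x86 has no dedicated abds
; instruction, so this form maps straight onto the vpmins*/vpmaxs*/vpsub*
; triples checked below.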

define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_minmax_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %a, <32 x i8> %b)
  %max = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %a, <32 x i8> %b)
  %sub = sub <32 x i8> %max, %min
  ret <32 x i8> %sub
}

define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_minmax_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %a, <16 x i16> %b)
  %max = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %a, <16 x i16> %b)
  %sub = sub <16 x i16> %max, %min
  ret <16 x i16> %sub
}

define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_minmax_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b)
  %max = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b)
  %sub = sub <8 x i32> %max, %min
  ret <8 x i32> %sub
}

define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_minmax_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %a, <4 x i64> %b)
  %max = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %a, <4 x i64> %b)
  %sub = sub <4 x i64> %max, %min
  ret <4 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abds(a,b)
;
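; Illustrative: for i8 a = 2, b = 7, icmp sgt is false, so the select returns
; b - a = 5, the non-negative difference; the sgt/sge/slt variants below
; differ only in which arm holds which subtraction, and all are recognized as
; the same abds pattern.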

define <32 x i8> @abd_cmp_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_cmp_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp sgt <32 x i8> %a, %b
  %ab = sub <32 x i8> %a, %b
  %ba = sub <32 x i8> %b, %a
  %sel = select <32 x i1> %cmp, <32 x i8> %ab, <32 x i8> %ba
  ret <32 x i8> %sel
}

define <16 x i16> @abd_cmp_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_cmp_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp sge <16 x i16> %a, %b
  %ab = sub <16 x i16> %a, %b
  %ba = sub <16 x i16> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i16> %ab, <16 x i16> %ba
  ret <16 x i16> %sel
}

define <8 x i32> @abd_cmp_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_cmp_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp slt <8 x i32> %a, %b
  %ab = sub <8 x i32> %a, %b
  %ba = sub <8 x i32> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i32> %ba, <8 x i32> %ab
  ret <8 x i32> %sel
}

define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_cmp_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp sge <4 x i64> %a, %b
  %ab = sub <4 x i64> %a, %b
  %ba = sub <4 x i64> %b, %a
  %sel = select <4 x i1> %cmp, <4 x i64> %ab, <4 x i64> %ba
  ret <4 x i64> %sel
}

;
; abs(sub_nsw(a,b)) -> abds(a,b)
;
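; The nsw flag promises a - b never wraps, so abs(sub nsw (a, b)) is the exact
; absolute difference; e.g. (illustrative) i8 a = 100, b = 50 gives abs(50) =
; 50 = abds(a,b), while a = 100, b = -100 would make the sub poison. The
; lowerings below exploit this with vpsub* followed by vpabs* where available.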

define <32 x i8> @abd_subnsw_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_subnsw_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpabsb %xmm0, %xmm0
; AVX1-NEXT:    vpabsb %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_subnsw_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpabsb %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_subnsw_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpabsb %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sub = sub nsw <32 x i8> %a, %b
  %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %sub, i1 false)
  ret <32 x i8> %abs
}

define <16 x i16> @abd_subnsw_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_subnsw_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpabsw %xmm0, %xmm0
; AVX1-NEXT:    vpabsw %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_subnsw_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpabsw %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_subnsw_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpabsw %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sub = sub nsw <16 x i16> %a, %b
  %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 false)
  ret <16 x i16> %abs
}

define <8 x i32> @abd_subnsw_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_subnsw_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpabsd %xmm0, %xmm0
; AVX1-NEXT:    vpabsd %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_subnsw_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpabsd %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_subnsw_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpabsd %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sub = sub nsw <8 x i32> %a, %b
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 false)
  ret <8 x i32> %abs
}

define <4 x i64> @abd_subnsw_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_subnsw_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_subnsw_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_subnsw_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpabsq %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sub = sub nsw <4 x i64> %a, %b
  %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false)
  ret <4 x i64> %abs
}

declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
declare <4 x i128> @llvm.abs.v4i128(<4 x i128>, i1)

declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)

declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)