; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512

;
; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b)
;
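; Note: zero-extending both operands to the wider type means the subtract can
; never wrap, so abs(sub(zext(a),zext(b))) truncated back to the source width
; is exactly the unsigned absolute difference of a and b; where packed unsigned
; min/max instructions exist this lowers to sub(umax(a,b),umin(a,b)).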

define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_ext_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <32 x i8> %a to <32 x i64>
  %bext = zext <32 x i8> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false)
  %trunc = trunc <32 x i64> %abs to <32 x i8>
  ret <32 x i8> %trunc
}

define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_ext_v32i8_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v32i8_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v32i8_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <32 x i8> %a to <32 x i64>
  %bext = zext <32 x i8> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true)
  %trunc = trunc <32 x i64> %abs to <32 x i8>
  ret <32 x i8> %trunc
}

define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_ext_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <16 x i16> %a to <16 x i64>
  %bext = zext <16 x i16> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i16>
  ret <16 x i16> %trunc
}

define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_ext_v16i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v16i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v16i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <16 x i16> %a to <16 x i64>
  %bext = zext <16 x i16> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i16>
  ret <16 x i16> %trunc
}

define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_ext_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <8 x i32> %a to <8 x i64>
  %bext = zext <8 x i32> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false)
  %trunc = trunc <8 x i64> %abs to <8 x i32>
  ret <8 x i32> %trunc
}

define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_ext_v8i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v8i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v8i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <8 x i32> %a to <8 x i64>
  %bext = zext <8 x i32> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true)
  %trunc = trunc <8 x i64> %abs to <8 x i32>
  ret <8 x i32> %trunc
}

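; There is no packed unsigned 64-bit min/max (vpminuq/vpmaxuq) or unsigned
; qword compare before AVX512VL, so the v4i64 cases below bias both operands by
; the sign bit and reuse the signed vpcmpgtq result as a mask to form abs(a-b).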
define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm3 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <4 x i64> %a to <4 x i128>
  %bext = zext <4 x i64> %b to <4 x i128>
  %sub = sub <4 x i128> %aext, %bext
  %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 false)
  %trunc = trunc <4 x i128> %abs to <4 x i64>
  ret <4 x i64> %trunc
}

define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm3 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v4i64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %aext = zext <4 x i64> %a to <4 x i128>
  %bext = zext <4 x i64> %b to <4 x i128>
  %sub = sub <4 x i128> %aext, %bext
  %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 true)
  %trunc = trunc <4 x i128> %abs to <4 x i64>
  ret <4 x i64> %trunc
}

;
; sub(umax(a,b),umin(a,b)) -> abdu(a,b)
;
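; Note: for unsigned values max(a,b) - min(a,b) equals |a - b|, so this form
; maps onto the same min/max/sub sequences as the zext/abs/trunc tests above.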

define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_minmax_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a, <32 x i8> %b)
  %max = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %a, <32 x i8> %b)
  %sub = sub <32 x i8> %max, %min
  ret <32 x i8> %sub
}

define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_minmax_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a, <16 x i16> %b)
  %max = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %a, <16 x i16> %b)
  %sub = sub <16 x i16> %max, %min
  ret <16 x i16> %sub
}

define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_minmax_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b)
  %max = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b)
  %sub = sub <8 x i32> %max, %min
  ret <8 x i32> %sub
}

define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_minmax_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm3 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %min = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %a, <4 x i64> %b)
  %max = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %a, <4 x i64> %b)
  %sub = sub <4 x i64> %max, %min
  ret <4 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abdu(a,b)
;
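; Note: selecting between a-b and b-a on an unsigned comparison of a and b also
; yields |a - b|; ugt/uge/ult predicates all fold to the same abdu lowering as
; long as the select picks the larger-minus-smaller difference.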

define <32 x i8> @abd_cmp_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: abd_cmp_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp ugt <32 x i8> %a, %b
  %ab = sub <32 x i8> %a, %b
  %ba = sub <32 x i8> %b, %a
  %sel = select <32 x i1> %cmp, <32 x i8> %ab, <32 x i8> %ba
  ret <32 x i8> %sel
}

define <16 x i16> @abd_cmp_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: abd_cmp_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp uge <16 x i16> %a, %b
  %ab = sub <16 x i16> %a, %b
  %ba = sub <16 x i16> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i16> %ab, <16 x i16> %ba
  ret <16 x i16> %sel
}

define <8 x i32> @abd_cmp_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: abd_cmp_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp ult <8 x i32> %a, %b
  %ab = sub <8 x i32> %a, %b
  %ba = sub <8 x i32> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i32> %ba, <8 x i32> %ab
  ret <8 x i32> %sel
}

define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_cmp_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm3 = mem[0,0]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %cmp = icmp uge <4 x i64> %a, %b
  %ab = sub <4 x i64> %a, %b
  %ba = sub <4 x i64> %b, %a
  %sel = select <4 x i1> %cmp, <4 x i64> %ab, <4 x i64> %ba
  ret <4 x i64> %sel
}

declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
declare <4 x i128> @llvm.abs.v4i128(<4 x i128>, i1)

declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)

declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)