; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux                  | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.2   | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

;
; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b)
;
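; The sign extensions mean the subtraction can never overflow, so truncating
; abs(sub) back to the source element type yields exactly the signed absolute
; difference of the original operands. The _undef variants pass i1 true to
; llvm.abs and should lower the same way.
;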

define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v16i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsb %xmm1, %xmm2
; SSE42-NEXT:    pmaxsb %xmm1, %xmm0
; SSE42-NEXT:    psubb %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = sext <16 x i8> %a to <16 x i64>
  %bext = sext <16 x i8> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i8>
  ret <16 x i8> %trunc
}

define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v16i8_undef:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsb %xmm1, %xmm2
; SSE42-NEXT:    pmaxsb %xmm1, %xmm0
; SSE42-NEXT:    psubb %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v16i8_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = sext <16 x i8> %a to <16 x i64>
  %bext = sext <16 x i8> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i8>
  ret <16 x i8> %trunc
}

define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: abd_ext_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminsw %xmm1, %xmm2
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_ext_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = sext <8 x i16> %a to <8 x i64>
  %bext = sext <8 x i16> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false)
  %trunc = trunc <8 x i64> %abs to <8 x i16>
  ret <8 x i16> %trunc
}

define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: abd_ext_v8i16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminsw %xmm1, %xmm2
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_ext_v8i16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = sext <8 x i16> %a to <8 x i64>
  %bext = sext <8 x i16> %b to <8 x i64>
  %sub = sub <8 x i64> %aext, %bext
  %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true)
  %trunc = trunc <8 x i64> %abs to <8 x i16>
  ret <8 x i16> %trunc
}

define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsd %xmm1, %xmm2
; SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = sext <4 x i32> %a to <4 x i64>
  %bext = sext <4 x i32> %b to <4 x i64>
  %sub = sub <4 x i64> %aext, %bext
  %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false)
  %trunc = trunc <4 x i64> %abs to <4 x i32>
  ret <4 x i32> %trunc
}

define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsd %xmm1, %xmm2
; SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_ext_v4i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %aext = sext <4 x i32> %a to <4 x i64>
  %bext = sext <4 x i32> %b to <4 x i64>
  %sub = sub <4 x i64> %aext, %bext
  %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true)
  %trunc = trunc <4 x i64> %abs to <4 x i32>
  ret <4 x i32> %trunc
}

define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %aext = sext <2 x i64> %a to <2 x i128>
  %bext = sext <2 x i64> %b to <2 x i128>
  %sub = sub <2 x i128> %aext, %bext
  %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 false)
  %trunc = trunc <2 x i128> %abs to <2 x i64>
  ret <2 x i64> %trunc
}

define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %aext = sext <2 x i64> %a to <2 x i128>
  %bext = sext <2 x i64> %b to <2 x i128>
  %sub = sub <2 x i128> %aext, %bext
  %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true)
  %trunc = trunc <2 x i128> %abs to <2 x i64>
  ret <2 x i64> %trunc
}

;
; sub(smax(a,b),smin(a,b)) -> abds(a,b)
;
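; smax(a,b) - smin(a,b) is always non-negative and equals |a - b|, so the
; min/max form maps directly to abds(a,b).
;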

define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_minmax_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_minmax_v16i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsb %xmm1, %xmm2
; SSE42-NEXT:    pmaxsb %xmm1, %xmm0
; SSE42-NEXT:    psubb %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_minmax_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %min = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b)
  %max = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b)
  %sub = sub <16 x i8> %max, %min
  ret <16 x i8> %sub
}

define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: abd_minmax_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminsw %xmm1, %xmm2
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_minmax_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %min = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %b)
  %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %a, <8 x i16> %b)
  %sub = sub <8 x i16> %max, %min
  ret <8 x i16> %sub
}

define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_minmax_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_minmax_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsd %xmm1, %xmm2
; SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_minmax_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b)
  %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
  %sub = sub <4 x i32> %max, %min
  ret <4 x i32> %sub
}

define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_minmax_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_minmax_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %min = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b)
  %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b)
  %sub = sub <2 x i64> %max, %min
  ret <2 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abds(a,b)
;
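; The select returns whichever subtraction is non-negative, so these signed
; compare/select patterns (sgt, sge, and slt with swapped select arms) are all
; equivalent to abds(a,b).
;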

define <16 x i8> @abd_cmp_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_cmp_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v16i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsb %xmm1, %xmm2
; SSE42-NEXT:    pmaxsb %xmm1, %xmm0
; SSE42-NEXT:    psubb %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp sgt <16 x i8> %a, %b
  %ab = sub <16 x i8> %a, %b
  %ba = sub <16 x i8> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i8> %ab, <16 x i8> %ba
  ret <16 x i8> %sel
}

define <8 x i16> @abd_cmp_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: abd_cmp_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pminsw %xmm1, %xmm2
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp sge <8 x i16> %a, %b
  %ab = sub <8 x i16> %a, %b
  %ba = sub <8 x i16> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i16> %ab, <8 x i16> %ba
  ret <8 x i16> %sel
}

define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_cmp_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pminsd %xmm1, %xmm2
; SSE42-NEXT:    pmaxsd %xmm1, %xmm0
; SSE42-NEXT:    psubd %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp slt <4 x i32> %a, %b
  %ab = sub <4 x i32> %a, %b
  %ba = sub <4 x i32> %b, %a
  %sel = select <4 x i1> %cmp, <4 x i32> %ba, <4 x i32> %ab
  ret <4 x i32> %sel
}

define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm2, %xmm0
; SSE42-NEXT:    psubq %xmm0, %xmm2
; SSE42-NEXT:    movdqa %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_cmp_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %cmp = icmp sge <2 x i64> %a, %b
  %ab = sub <2 x i64> %a, %b
  %ba = sub <2 x i64> %b, %a
  %sel = select <2 x i1> %cmp, <2 x i64> %ab, <2 x i64> %ba
  ret <2 x i64> %sel
}

;
; abs(sub_nsw(a,b)) -> abds(a,b)
;
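; The nsw flag guarantees the subtraction cannot overflow, so abs(a - b) is
; already the exact signed absolute difference.
;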

define <16 x i8> @abd_subnsw_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_subnsw_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubb %xmm0, %xmm1
; SSE2-NEXT:    pminub %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_subnsw_v16i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    psubb %xmm1, %xmm0
; SSE42-NEXT:    pabsb %xmm0, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_subnsw_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpabsb %xmm0, %xmm0
; AVX-NEXT:    retq
  %sub = sub nsw <16 x i8> %a, %b
  %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 false)
  ret <16 x i8> %abs
}

define <8 x i16> @abd_subnsw_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: abd_subnsw_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_subnsw_v8i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    psubw %xmm1, %xmm0
; SSE42-NEXT:    pabsw %xmm0, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_subnsw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpabsw %xmm0, %xmm0
; AVX-NEXT:    retq
  %sub = sub nsw <8 x i16> %a, %b
  %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 false)
  ret <8 x i16> %abs
}

define <4 x i32> @abd_subnsw_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_subnsw_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_subnsw_v4i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    psubd %xmm1, %xmm0
; SSE42-NEXT:    pabsd %xmm0, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: abd_subnsw_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpabsd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sub = sub nsw <4 x i32> %a, %b
  %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 false)
  ret <4 x i32> %abs
}

define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_subnsw_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_subnsw_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    psubq %xmm1, %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm1
; SSE42-NEXT:    psubq %xmm0, %xmm1
; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_subnsw_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_subnsw_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_subnsw_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpabsq %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sub = sub nsw <2 x i64> %a, %b
  %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 false)
  ret <2 x i64> %abs
}

;
; Special cases
;
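; Multi-use variants: the compare or the subtraction feeding the select has an
; additional user, checking that the abds patterns still lower sensibly when
; an intermediate value must be kept around.
;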

define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT:    pand %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT:    por %xmm5, %xmm4
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm4, %xmm0
; SSE2-NEXT:    psubq %xmm0, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
; SSE42-NEXT:    movdqa %xmm0, %xmm3
; SSE42-NEXT:    psubq %xmm1, %xmm3
; SSE42-NEXT:    pxor %xmm2, %xmm3
; SSE42-NEXT:    psubq %xmm3, %xmm2
; SSE42-NEXT:    pcmpgtq %xmm0, %xmm1
; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE42-NEXT:    pxor %xmm1, %xmm0
; SSE42-NEXT:    paddq %xmm2, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpxor %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm3, %xmm3
; AVX2-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm3
; AVX512-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %cmp = icmp sge <2 x i64> %a, %b
  %ab = sub <2 x i64> %a, %b
  %ba = sub <2 x i64> %b, %a
  %sel = select <2 x i1> %cmp, <2 x i64> %ab, <2 x i64> %ba
  %ext = sext <2 x i1> %cmp to <2 x i64>
  %res = add <2 x i64> %ext, %sel
  ret <2 x i64> %res
}

define <8 x i16> @abd_cmp_v8i16_multiuse_sub(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: abd_cmp_v8i16_multiuse_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psubw %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pminsw %xmm1, %xmm3
; SSE-NEXT:    pmaxsw %xmm1, %xmm0
; SSE-NEXT:    psubw %xmm3, %xmm0
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: abd_cmp_v8i16_multiuse_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %cmp = icmp sgt <8 x i16> %a, %b
  %ab = sub <8 x i16> %a, %b
  %ba = sub <8 x i16> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i16> %ab, <8 x i16> %ba
  %res = add <8 x i16> %ab, %sel
  ret <8 x i16> %res
}

declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1)

declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)

declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)