; xref: /llvm-project/llvm/test/CodeGen/X86/combine-sub-usat.ll (revision be6c752e157638849f1f59f7e2b7ecbe11a022fe)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

declare  i32 @llvm.usub.sat.i32  (i32, i32)
declare  i64 @llvm.usub.sat.i64  (i64, i64)
declare  <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>)
declare  <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>)

; fold (usub_sat x, undef) -> 0
define i32 @combine_undef_i32(i32 %a0) {
; CHECK-LABEL: combine_undef_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.usub.sat.i32(i32 %a0, i32 undef)
  ret i32 %res
}

; fold (usub_sat undef, x) -> 0 (vector case; codegen is a plain register zero)
define <8 x i16> @combine_undef_v8i16(<8 x i16> %a0) {
; SSE-LABEL: combine_undef_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> %a0)
  ret <8 x i16> %res
}

; fold (usub_sat c1, c2) -> c3
define i32 @combine_constfold_i32() {
; CHECK-LABEL: combine_constfold_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.usub.sat.i32(i32 100, i32 4294967295)
  ret i32 %res
}

define <8 x i16> @combine_constfold_v8i16() {
; SSE-LABEL: combine_constfold_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_constfold_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_constfold_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_constfold_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = [0,254,65534,0]
; AVX512-NEXT:    retq
  %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> <i16 0, i16 1, i16 255, i16 65535, i16 -1, i16 -255, i16 -65535, i16 1>, <8 x i16> <i16 1, i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1, i16 65535>)
  ret <8 x i16> %res
}

; as above, but lanes with an undef operand also fold to 0 (see the CHECK constants)
define <8 x i16> @combine_constfold_undef_v8i16() {
; SSE-LABEL: combine_constfold_undef_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_constfold_undef_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_constfold_undef_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_constfold_undef_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = [0,65534]
; AVX512-NEXT:    retq
  %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -65535, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 1, i16 65535>)
  ret <8 x i16> %res
}

; fold (usub_sat x, 0) -> x
define i32 @combine_zero_i32(i32 %a0) {
; CHECK-LABEL: combine_zero_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = call i32 @llvm.usub.sat.i32(i32 %a0, i32 0)
  ret i32 %1
}

define <8 x i16> @combine_zero_v8i16(<8 x i16> %a0) {
; CHECK-LABEL: combine_zero_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> zeroinitializer)
  ret <8 x i16> %1
}

; fold (usub_sat x, x) -> 0
define i32 @combine_self_i32(i32 %a0) {
; CHECK-LABEL: combine_self_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = call i32 @llvm.usub.sat.i32(i32 %a0, i32 %a0)
  ret i32 %1
}

define <8 x i16> @combine_self_v8i16(<8 x i16> %a0) {
; SSE-LABEL: combine_self_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_self_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a0)
  ret <8 x i16> %1
}

; fold (usub_sat x, y) -> (sub x, y) iff no overflow
define i32 @combine_no_overflow_i32(i32 %a0, i32 %a1) {
; CHECK-LABEL: combine_no_overflow_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    shrl $16, %edi
; CHECK-NEXT:    shrl $16, %esi
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    subl %esi, %edi
; CHECK-NEXT:    cmovael %edi, %eax
; CHECK-NEXT:    retq
  %1 = lshr i32 %a0, 16
  %2 = lshr i32 %a1, 16
  %3 = call i32 @llvm.usub.sat.i32(i32 %1, i32 %2)
  ret i32 %3
}

define <8 x i16> @combine_no_overflow_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_no_overflow_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $10, %xmm0
; SSE-NEXT:    psrlw $10, %xmm1
; SSE-NEXT:    psubusw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_no_overflow_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $10, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $10, %xmm1, %xmm1
; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %2 = lshr <8 x i16> %a1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %3 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %1, <8 x i16> %2)
  ret <8 x i16> %3
}

; FIXME: fold (trunc (usub_sat zext(x), y)) -> usub_sat(x, trunc(umin(y,satlimit)))
define i16 @combine_trunc_i32_i16(i16 %a0, i32 %a1) {
; CHECK-LABEL: combine_trunc_i32_i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movzwl %di, %eax
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    subl %esi, %eax
; CHECK-NEXT:    cmovbl %ecx, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %1 = zext i16 %a0 to i32
  %2 = call i32 @llvm.usub.sat.i32(i32 %1, i32 %a1)
  %3 = trunc i32 %2 to i16
  ret i16 %3
}

define <8 x i8> @combine_trunc_v8i16_v8i8(<8 x i8> %a0, <8 x i16> %a1) {
; SSE2-LABEL: combine_trunc_v8i16_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    psubusw %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_trunc_v8i16_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    psubusw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; SSE42-LABEL: combine_trunc_v8i16_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE42-NEXT:    psubusw %xmm1, %xmm0
; SSE42-NEXT:    packuswb %xmm0, %xmm0
; SSE42-NEXT:    retq
;
; AVX-LABEL: combine_trunc_v8i16_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = zext <8 x i8> %a0 to <8 x i16>
  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %1, <8 x i16> %a1)
  %3 = trunc <8 x i16> %2 to <8 x i8>
  ret <8 x i8> %3
}

define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) {
; SSE2-LABEL: combine_trunc_v8i32_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pxor %xmm3, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm6
; SSE2-NEXT:    por %xmm2, %xmm6
; SSE2-NEXT:    pslld $16, %xmm6
; SSE2-NEXT:    psrad $16, %xmm6
; SSE2-NEXT:    pxor %xmm1, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
; SSE2-NEXT:    pxor %xmm5, %xmm4
; SSE2-NEXT:    pand %xmm1, %xmm5
; SSE2-NEXT:    por %xmm4, %xmm5
; SSE2-NEXT:    pslld $16, %xmm5
; SSE2-NEXT:    psrad $16, %xmm5
; SSE2-NEXT:    packssdw %xmm6, %xmm5
; SSE2-NEXT:    psubusw %xmm5, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_trunc_v8i32_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE41-NEXT:    pminud %xmm3, %xmm2
; SSE41-NEXT:    pminud %xmm3, %xmm1
; SSE41-NEXT:    packusdw %xmm2, %xmm1
; SSE41-NEXT:    psubusw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; SSE42-LABEL: combine_trunc_v8i32_v8i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
; SSE42-NEXT:    pminud %xmm3, %xmm2
; SSE42-NEXT:    pminud %xmm3, %xmm1
; SSE42-NEXT:    packusdw %xmm2, %xmm1
; SSE42-NEXT:    psubusw %xmm1, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: combine_trunc_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_trunc_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_trunc_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    vpmovusdw %zmm1, %ymm1
; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = zext <8 x i16> %a0 to <8 x i32>
  %2 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %1, <8 x i32> %a1)
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

; fold (usub_sat (shuffle x, u, m), (shuffle y, u, m)) -> (shuffle (usub_sat x, y), u, m)
define <8 x i16> @combine_shuffle_shuffle_v8i16(<8 x i16> %x0, <8 x i16> %y0) {
; SSE-LABEL: combine_shuffle_shuffle_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubusw %xmm1, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_shuffle_shuffle_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
  %x1= shufflevector <8 x i16> %x0, <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
  %y1 = shufflevector <8 x i16> %y0, <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
  %res = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x1, <8 x i16> %y1)
  ret <8 x i16> %res
}
