; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX

declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone

declare {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; fold (smulo x, 2) -> (saddo x, x)
define i32 @combine_smul_two(i32 %a0, i32 %a1) {
; CHECK-LABEL: combine_smul_two:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    cmovol %esi, %eax
; CHECK-NEXT:    retq
  %1 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a0, i32 2)
  %2 = extractvalue {i32, i1} %1, 0
  %3 = extractvalue {i32, i1} %1, 1
  %4 = select i1 %3, i32 %a1, i32 %2
  ret i32 %4
}

define <4 x i32> @combine_vec_smul_two(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_smul_two:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
; SSE-NEXT:    pxor %xmm3, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_smul_two:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm3
; AVX-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %1 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
  %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
  %3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
  %4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
  ret <4 x i32> %4
}

; fold (umulo x, 2) -> (uaddo x, x)
define i32 @combine_umul_two(i32 %a0, i32 %a1) {
; CHECK-LABEL: combine_umul_two:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    cmovbl %esi, %eax
; CHECK-NEXT:    retq
  %1 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a0, i32 2)
  %2 = extractvalue {i32, i1} %1, 0
  %3 = extractvalue {i32, i1} %1, 1
  %4 = select i1 %3, i32 %a1, i32 %2
  ret i32 %4
}

define <4 x i32> @combine_vec_umul_two(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_umul_two:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pmaxud %xmm2, %xmm0
; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_umul_two:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
  %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
  %3 = extractvalue {<4 x i32>, <4 x i1>} %1, 1
  %4 = select <4 x i1> %3, <4 x i32> %a1, <4 x i32> %2
  ret <4 x i32> %4
}

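; fold the overflow flag of (smulo x, y) to false when the operands are masked
; small enough that the product cannot overflow (12-bit * 19-bit fits in i32),
; leaving a plain multiply and a zeroed i1 result.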
define { i32, i1 } @combine_smul_nsw(i32 %a, i32 %b) {
; CHECK-LABEL: combine_smul_nsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %esi, %eax
; CHECK-NEXT:    andl $4095, %edi # imm = 0xFFF
; CHECK-NEXT:    andl $524287, %eax # imm = 0x7FFFF
; CHECK-NEXT:    imull %edi, %eax
; CHECK-NEXT:    xorl %edx, %edx
; CHECK-NEXT:    retq
  %aa = and i32 %a, 4095 ; 0xfff
  %bb = and i32 %b, 524287 ; 0x7ffff
  %x = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %aa, i32 %bb)
  ret { i32, i1 } %x
}

define { <4 x i32>, <4 x i1> } @combine_vec_smul_nsw(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_smul_nsw:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_smul_nsw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4095,4095,4095,4095]
; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [524287,524287,524287,524287]
; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    retq
  %aa = and <4 x i32> %a, <i32 4095, i32 4095, i32 4095, i32 4095>
  %bb = and <4 x i32> %b, <i32 524287, i32 524287, i32 524287, i32 524287>
  %x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %aa, <4 x i32> %bb)
  ret { <4 x i32>, <4 x i1> } %x
}