1*86d7f5d3SJohn Marino /* mpn_divexact_by3c -- mpn exact division by 3.
2*86d7f5d3SJohn Marino
3*86d7f5d3SJohn Marino Copyright 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
4*86d7f5d3SJohn Marino
5*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
6*86d7f5d3SJohn Marino
7*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
8*86d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
9*86d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
10*86d7f5d3SJohn Marino option) any later version.
11*86d7f5d3SJohn Marino
12*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
13*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14*86d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15*86d7f5d3SJohn Marino License for more details.
16*86d7f5d3SJohn Marino
17*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
18*86d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
19*86d7f5d3SJohn Marino
20*86d7f5d3SJohn Marino #include "gmp.h"
21*86d7f5d3SJohn Marino #include "gmp-impl.h"
22*86d7f5d3SJohn Marino
23*86d7f5d3SJohn Marino #if DIVEXACT_BY3_METHOD == 0
24*86d7f5d3SJohn Marino
25*86d7f5d3SJohn Marino mp_limb_t
mpn_divexact_by3c(mp_ptr rp,mp_srcptr up,mp_size_t un,mp_limb_t c)26*86d7f5d3SJohn Marino mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
27*86d7f5d3SJohn Marino {
28*86d7f5d3SJohn Marino mp_limb_t r;
29*86d7f5d3SJohn Marino r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
30*86d7f5d3SJohn Marino
31*86d7f5d3SJohn Marino /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
32*86d7f5d3SJohn Marino We want to return C. We compute the remainder mod 4 and notice that the
33*86d7f5d3SJohn Marino inverse of (2^(2k)-1)/3 mod 4 is 1. */
34*86d7f5d3SJohn Marino return r & 3;
35*86d7f5d3SJohn Marino }
36*86d7f5d3SJohn Marino
37*86d7f5d3SJohn Marino #endif
38*86d7f5d3SJohn Marino
39*86d7f5d3SJohn Marino #if DIVEXACT_BY3_METHOD == 1
40*86d7f5d3SJohn Marino
41*86d7f5d3SJohn Marino /* The algorithm here is basically the same as mpn_divexact_1, as described
42*86d7f5d3SJohn Marino in the manual. Namely at each step q = (src[i]-c)*inverse, and new c =
43*86d7f5d3SJohn Marino borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3,
44*86d7f5d3SJohn Marino high(divisor*q) can be determined with two comparisons instead of a
45*86d7f5d3SJohn Marino multiply.
46*86d7f5d3SJohn Marino
47*86d7f5d3SJohn Marino The "c += ..."s add the high limb of 3*l to c. That high limb will be 0,
48*86d7f5d3SJohn Marino 1 or 2. Doing two separate "+="s seems to give better code on gcc (as of
49*86d7f5d3SJohn Marino 2.95.2 at least).
50*86d7f5d3SJohn Marino
51*86d7f5d3SJohn Marino It will be noted that the new c is formed by adding three values each 0
52*86d7f5d3SJohn Marino or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c
53*86d7f5d3SJohn Marino causes a borrow, that leaves a limb value of either 0xFF...FF or
54*86d7f5d3SJohn Marino 0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
55*86d7f5d3SJohn Marino 0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
56*86d7f5d3SJohn Marino respectively, hence a total of no more than 2.
57*86d7f5d3SJohn Marino
58*86d7f5d3SJohn Marino Alternatives:
59*86d7f5d3SJohn Marino
60*86d7f5d3SJohn Marino This implementation has each multiply on the dependent chain, due to
61*86d7f5d3SJohn Marino "l=s-c". See below for alternative code which avoids that. */
62*86d7f5d3SJohn Marino
63*86d7f5d3SJohn Marino mp_limb_t
mpn_divexact_by3c(mp_ptr restrict rp,mp_srcptr restrict up,mp_size_t un,mp_limb_t c)64*86d7f5d3SJohn Marino mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
65*86d7f5d3SJohn Marino {
66*86d7f5d3SJohn Marino mp_limb_t l, q, s;
67*86d7f5d3SJohn Marino mp_size_t i;
68*86d7f5d3SJohn Marino
69*86d7f5d3SJohn Marino ASSERT (un >= 1);
70*86d7f5d3SJohn Marino ASSERT (c == 0 || c == 1 || c == 2);
71*86d7f5d3SJohn Marino ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
72*86d7f5d3SJohn Marino
73*86d7f5d3SJohn Marino i = 0;
74*86d7f5d3SJohn Marino do
75*86d7f5d3SJohn Marino {
76*86d7f5d3SJohn Marino s = up[i];
77*86d7f5d3SJohn Marino SUBC_LIMB (c, l, s, c);
78*86d7f5d3SJohn Marino
79*86d7f5d3SJohn Marino q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
80*86d7f5d3SJohn Marino rp[i] = q;
81*86d7f5d3SJohn Marino
82*86d7f5d3SJohn Marino c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
83*86d7f5d3SJohn Marino c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
84*86d7f5d3SJohn Marino }
85*86d7f5d3SJohn Marino while (++i < un);
86*86d7f5d3SJohn Marino
87*86d7f5d3SJohn Marino ASSERT (c == 0 || c == 1 || c == 2);
88*86d7f5d3SJohn Marino return c;
89*86d7f5d3SJohn Marino }
90*86d7f5d3SJohn Marino
91*86d7f5d3SJohn Marino
92*86d7f5d3SJohn Marino #endif
93*86d7f5d3SJohn Marino
94*86d7f5d3SJohn Marino #if DIVEXACT_BY3_METHOD == 2
95*86d7f5d3SJohn Marino
96*86d7f5d3SJohn Marino /* The following alternative code re-arranges the quotient calculation from
97*86d7f5d3SJohn Marino (src[i]-c)*inverse to instead
98*86d7f5d3SJohn Marino
99*86d7f5d3SJohn Marino q = src[i]*inverse - c*inverse
100*86d7f5d3SJohn Marino
101*86d7f5d3SJohn Marino thereby allowing src[i]*inverse to be scheduled back as far as desired,
102*86d7f5d3SJohn Marino making full use of multiplier throughput and leaving just some carry
103*86d7f5d3SJohn Marino handing on the dependent chain.
104*86d7f5d3SJohn Marino
105*86d7f5d3SJohn Marino The carry handling consists of determining the c for the next iteration.
106*86d7f5d3SJohn Marino This is the same as described above, namely look for any borrow from
107*86d7f5d3SJohn Marino src[i]-c, and at the high of 3*q.
108*86d7f5d3SJohn Marino
109*86d7f5d3SJohn Marino high(3*q) is done with two comparisons as above (in c2 and c3). The
110*86d7f5d3SJohn Marino borrow from src[i]-c is incorporated into those by noting that if there's
111*86d7f5d3SJohn Marino a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
112*86d7f5d3SJohn Marino giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is
113*86d7f5d3SJohn Marino enough to make high(3*q) come out 1 bigger, as required.
114*86d7f5d3SJohn Marino
115*86d7f5d3SJohn Marino l = -c*inverse is calculated at the same time as c, since for most chips
116*86d7f5d3SJohn Marino it can be more conveniently derived from separate c1/c2/c3 values than
117*86d7f5d3SJohn Marino from a combined c equal to 0, 1 or 2.
118*86d7f5d3SJohn Marino
119*86d7f5d3SJohn Marino The net effect is that with good pipelining this loop should be able to
120*86d7f5d3SJohn Marino run at perhaps 4 cycles/limb, depending on available execute resources
121*86d7f5d3SJohn Marino etc.
122*86d7f5d3SJohn Marino
123*86d7f5d3SJohn Marino Usage:
124*86d7f5d3SJohn Marino
125*86d7f5d3SJohn Marino This code is not used by default, since we really can't rely on the
126*86d7f5d3SJohn Marino compiler generating a good software pipeline, nor on such an approach
127*86d7f5d3SJohn Marino even being worthwhile on all CPUs.
128*86d7f5d3SJohn Marino
129*86d7f5d3SJohn Marino Itanium is one chip where this algorithm helps though, see
130*86d7f5d3SJohn Marino mpn/ia64/diveby3.asm. */
131*86d7f5d3SJohn Marino
132*86d7f5d3SJohn Marino mp_limb_t
mpn_divexact_by3c(mp_ptr restrict rp,mp_srcptr restrict up,mp_size_t un,mp_limb_t cy)133*86d7f5d3SJohn Marino mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
134*86d7f5d3SJohn Marino {
135*86d7f5d3SJohn Marino mp_limb_t s, sm, cl, q, qx, c2, c3;
136*86d7f5d3SJohn Marino mp_size_t i;
137*86d7f5d3SJohn Marino
138*86d7f5d3SJohn Marino ASSERT (un >= 1);
139*86d7f5d3SJohn Marino ASSERT (cy == 0 || cy == 1 || cy == 2);
140*86d7f5d3SJohn Marino ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
141*86d7f5d3SJohn Marino
142*86d7f5d3SJohn Marino cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
143*86d7f5d3SJohn Marino
144*86d7f5d3SJohn Marino for (i = 0; i < un; i++)
145*86d7f5d3SJohn Marino {
146*86d7f5d3SJohn Marino s = up[i];
147*86d7f5d3SJohn Marino sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
148*86d7f5d3SJohn Marino
149*86d7f5d3SJohn Marino q = (cl + sm) & GMP_NUMB_MASK;
150*86d7f5d3SJohn Marino rp[i] = q;
151*86d7f5d3SJohn Marino qx = q + (s < cy);
152*86d7f5d3SJohn Marino
153*86d7f5d3SJohn Marino c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
154*86d7f5d3SJohn Marino c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
155*86d7f5d3SJohn Marino
156*86d7f5d3SJohn Marino cy = c2 + c3;
157*86d7f5d3SJohn Marino cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
158*86d7f5d3SJohn Marino }
159*86d7f5d3SJohn Marino
160*86d7f5d3SJohn Marino return cy;
161*86d7f5d3SJohn Marino }
162*86d7f5d3SJohn Marino
163*86d7f5d3SJohn Marino #endif
164