xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/diveby3.c (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
1 /* mpn_divexact_by3c -- mpn exact division by 3.
2 
3 Copyright 2000-2003, 2008 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MP Library.
6 
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9 
10   * the GNU Lesser General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13 
14 or
15 
16   * the GNU General Public License as published by the Free Software
17     Foundation; either version 2 of the License, or (at your option) any
18     later version.
19 
20 or both in parallel, as here.
21 
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 for more details.
26 
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library.  If not,
29 see https://www.gnu.org/licenses/.  */
30 
31 #include "gmp-impl.h"
32 
33 #if DIVEXACT_BY3_METHOD == 0
34 
35 mp_limb_t
mpn_divexact_by3c(mp_ptr rp,mp_srcptr up,mp_size_t un,mp_limb_t c)36 mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
37 {
38   mp_limb_t r;
39   r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
40 
41   /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
42      We want to return C.  We compute the remainder mod 4 and notice that the
43      inverse of (2^(2k)-1)/3 mod 4 is 1.  */
44   return r & 3;
45 }
46 
47 #endif
48 
49 #if DIVEXACT_BY3_METHOD == 1
50 
51 /* The algorithm here is basically the same as mpn_divexact_1, as described
52    in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
53    borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
54    high(divisor*q) can be determined with two comparisons instead of a
55    multiply.
56 
57    The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
58    1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
59    2.95.2 at least).
60 
61    It will be noted that the new c is formed by adding three values each 0
62    or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
63    causes a borrow, that leaves a limb value of either 0xFF...FF or
64    0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
65    0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
66    respectively, hence a total of no more than 2.
67 
68    Alternatives:
69 
70    This implementation has each multiply on the dependent chain, due to
71    "l=s-c".  See below for alternative code which avoids that.  */
72 
73 mp_limb_t
mpn_divexact_by3c(mp_ptr restrict rp,mp_srcptr restrict up,mp_size_t un,mp_limb_t c)74 mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
75 {
76   mp_limb_t  l, q, s;
77   mp_size_t  i;
78 
79   ASSERT (un >= 1);
80   ASSERT (c == 0 || c == 1 || c == 2);
81   ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
82 
83   i = 0;
84   do
85     {
86       s = up[i];
87       SUBC_LIMB (c, l, s, c);
88 
89       q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
90       rp[i] = q;
91 
92       c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
93       c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
94     }
95   while (++i < un);
96 
97   ASSERT (c == 0 || c == 1 || c == 2);
98   return c;
99 }
100 
101 
102 #endif
103 
104 #if DIVEXACT_BY3_METHOD == 2
105 
106 /* The following alternative code re-arranges the quotient calculation from
107    (src[i]-c)*inverse to instead
108 
109        q = src[i]*inverse - c*inverse
110 
111    thereby allowing src[i]*inverse to be scheduled back as far as desired,
112    making full use of multiplier throughput and leaving just some carry
113    handing on the dependent chain.
114 
115    The carry handling consists of determining the c for the next iteration.
116    This is the same as described above, namely look for any borrow from
117    src[i]-c, and at the high of 3*q.
118 
119    high(3*q) is done with two comparisons as above (in c2 and c3).  The
120    borrow from src[i]-c is incorporated into those by noting that if there's
121    a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
122    giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
123    enough to make high(3*q) come out 1 bigger, as required.
124 
125    l = -c*inverse is calculated at the same time as c, since for most chips
126    it can be more conveniently derived from separate c1/c2/c3 values than
127    from a combined c equal to 0, 1 or 2.
128 
129    The net effect is that with good pipelining this loop should be able to
130    run at perhaps 4 cycles/limb, depending on available execute resources
131    etc.
132 
133    Usage:
134 
135    This code is not used by default, since we really can't rely on the
136    compiler generating a good software pipeline, nor on such an approach
137    even being worthwhile on all CPUs.
138 
139    Itanium is one chip where this algorithm helps though, see
140    mpn/ia64/diveby3.asm.  */
141 
142 mp_limb_t
mpn_divexact_by3c(mp_ptr restrict rp,mp_srcptr restrict up,mp_size_t un,mp_limb_t cy)143 mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
144 {
145   mp_limb_t  s, sm, cl, q, qx, c2, c3;
146   mp_size_t  i;
147 
148   ASSERT (un >= 1);
149   ASSERT (cy == 0 || cy == 1 || cy == 2);
150   ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
151 
152   cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
153 
154   for (i = 0; i < un; i++)
155     {
156       s = up[i];
157       sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
158 
159       q = (cl + sm) & GMP_NUMB_MASK;
160       rp[i] = q;
161       qx = q + (s < cy);
162 
163       c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
164       c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
165 
166       cy = c2 + c3;
167       cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
168     }
169 
170   return cy;
171 }
172 
173 #endif
174