186d7f5d3SJohn Marino /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
486d7f5d3SJohn Marino CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
586d7f5d3SJohn Marino FUTURE GNU MP RELEASES.
686d7f5d3SJohn Marino
786d7f5d3SJohn Marino Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino This file is part of the GNU MP Library.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1286d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1386d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1486d7f5d3SJohn Marino option) any later version.
1586d7f5d3SJohn Marino
1686d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1786d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1886d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1986d7f5d3SJohn Marino License for more details.
2086d7f5d3SJohn Marino
2186d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2286d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2386d7f5d3SJohn Marino
2486d7f5d3SJohn Marino #include "gmp.h"
2586d7f5d3SJohn Marino #include "gmp-impl.h"
2686d7f5d3SJohn Marino #include "longlong.h"
2786d7f5d3SJohn Marino
2886d7f5d3SJohn Marino
2986d7f5d3SJohn Marino /* Calculate an r satisfying
3086d7f5d3SJohn Marino
3186d7f5d3SJohn Marino r*B^k + a - c == q*d
3286d7f5d3SJohn Marino
3386d7f5d3SJohn Marino where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
3486d7f5d3SJohn Marino (the caller won't know which), and q is the quotient (discarded). d must
3586d7f5d3SJohn Marino be odd, c can be any limb value.
3686d7f5d3SJohn Marino
3786d7f5d3SJohn Marino If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
3886d7f5d3SJohn Marino
3986d7f5d3SJohn Marino This slightly strange function suits the initial Nx1 reduction for GCDs
4086d7f5d3SJohn Marino or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
4186d7f5d3SJohn Marino -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
4286d7f5d3SJohn Marino ignored, or for the Jacobi symbol it can be accounted for. The function
4386d7f5d3SJohn Marino also suits divisibility and congruence testing since if r=0 (or r=d) is
4486d7f5d3SJohn Marino obtained then a==c mod d.
4586d7f5d3SJohn Marino
4686d7f5d3SJohn Marino
4786d7f5d3SJohn Marino r is a bit like the remainder returned by mpn_divexact_by3c, and is the
4886d7f5d3SJohn Marino sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r
4986d7f5d3SJohn Marino represents a borrow, since effectively quotient limbs are chosen so that
5086d7f5d3SJohn Marino subtracting that multiple of d from src at each step will produce a zero
5186d7f5d3SJohn Marino limb.
5286d7f5d3SJohn Marino
5386d7f5d3SJohn Marino A long calculation can be done piece by piece from low to high by passing
5486d7f5d3SJohn Marino the return value from one part as the carry parameter to the next part.
5586d7f5d3SJohn Marino The effective final k becomes anything between size and size-n, if n
5686d7f5d3SJohn Marino pieces are used.
5786d7f5d3SJohn Marino
5886d7f5d3SJohn Marino
5986d7f5d3SJohn Marino A similar sort of routine could be constructed based on adding multiples
6086d7f5d3SJohn Marino of d at each limb, much like redc in mpz_powm does. Subtracting however
6186d7f5d3SJohn Marino has a small advantage that when subtracting to cancel out l there's never
6286d7f5d3SJohn Marino a borrow into h, whereas using an addition would put a carry into h
6386d7f5d3SJohn Marino depending whether l==0 or l!=0.
6486d7f5d3SJohn Marino
6586d7f5d3SJohn Marino
6686d7f5d3SJohn Marino In terms of efficiency, this function is similar to a mul-by-inverse
6786d7f5d3SJohn Marino mpn_mod_1. Both are essentially two multiplies and are best suited to
6886d7f5d3SJohn Marino CPUs with low latency multipliers (in comparison to a divide instruction
6986d7f5d3SJohn Marino at least.) But modexact has a few less supplementary operations, only
7086d7f5d3SJohn Marino needs low part and high part multiplies, and has fewer working quantities
7186d7f5d3SJohn Marino (helping CPUs with few registers).
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino
7486d7f5d3SJohn Marino In the main loop it will be noted that the new carry (call it r) is the
7586d7f5d3SJohn Marino sum of the high product h and any borrow from l=s-c. If c<d then we will
7686d7f5d3SJohn Marino have r<d too, for the following reasons. Let q=l*inverse be the quotient
7786d7f5d3SJohn Marino limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then
7886d7f5d3SJohn Marino
7986d7f5d3SJohn Marino l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
8086d7f5d3SJohn Marino
8186d7f5d3SJohn Marino But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
8286d7f5d3SJohn Marino never have h=d-1 and so r=h+borrow <= d-1.
8386d7f5d3SJohn Marino
8486d7f5d3SJohn Marino When c>=d, on the other hand, h=d-1 can certainly occur together with a
8586d7f5d3SJohn Marino borrow, thereby giving only r<=d, as per the function definition above.
8686d7f5d3SJohn Marino
8786d7f5d3SJohn Marino As a design decision it's left to the caller to check for r=d if it might
8886d7f5d3SJohn Marino be passing c>=d. Several applications have c<d initially so the extra
8986d7f5d3SJohn Marino test is often unnecessary, for example the GCDs or a plain divisibility
9086d7f5d3SJohn Marino d|a test will pass c=0.
9186d7f5d3SJohn Marino
9286d7f5d3SJohn Marino
9386d7f5d3SJohn Marino The special case for size==1 is so that it can be assumed c<=d in the
9486d7f5d3SJohn Marino high<=divisor test at the end. c<=d is only guaranteed after at least
9586d7f5d3SJohn Marino one iteration of the main loop. There's also a decent chance one % is
9686d7f5d3SJohn Marino faster than a binvert_limb, though that will depend on the processor.
9786d7f5d3SJohn Marino
9886d7f5d3SJohn Marino A CPU specific implementation might want to omit the size==1 code or the
9986d7f5d3SJohn Marino high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
10086d7f5d3SJohn Marino useful. */
10186d7f5d3SJohn Marino
10286d7f5d3SJohn Marino
10386d7f5d3SJohn Marino mp_limb_t
mpn_modexact_1c_odd(mp_srcptr src,mp_size_t size,mp_limb_t d,mp_limb_t orig_c)10486d7f5d3SJohn Marino mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
10586d7f5d3SJohn Marino mp_limb_t orig_c)
10686d7f5d3SJohn Marino {
10786d7f5d3SJohn Marino mp_limb_t s, h, l, inverse, dummy, dmul, ret;
10886d7f5d3SJohn Marino mp_limb_t c = orig_c;
10986d7f5d3SJohn Marino mp_size_t i;
11086d7f5d3SJohn Marino
11186d7f5d3SJohn Marino ASSERT (size >= 1);
11286d7f5d3SJohn Marino ASSERT (d & 1);
11386d7f5d3SJohn Marino ASSERT_MPN (src, size);
11486d7f5d3SJohn Marino ASSERT_LIMB (d);
11586d7f5d3SJohn Marino ASSERT_LIMB (c);
11686d7f5d3SJohn Marino
11786d7f5d3SJohn Marino if (size == 1)
11886d7f5d3SJohn Marino {
11986d7f5d3SJohn Marino s = src[0];
12086d7f5d3SJohn Marino if (s > c)
12186d7f5d3SJohn Marino {
12286d7f5d3SJohn Marino l = s-c;
12386d7f5d3SJohn Marino h = l % d;
12486d7f5d3SJohn Marino if (h != 0)
12586d7f5d3SJohn Marino h = d - h;
12686d7f5d3SJohn Marino }
12786d7f5d3SJohn Marino else
12886d7f5d3SJohn Marino {
12986d7f5d3SJohn Marino l = c-s;
13086d7f5d3SJohn Marino h = l % d;
13186d7f5d3SJohn Marino }
13286d7f5d3SJohn Marino return h;
13386d7f5d3SJohn Marino }
13486d7f5d3SJohn Marino
13586d7f5d3SJohn Marino
13686d7f5d3SJohn Marino binvert_limb (inverse, d);
13786d7f5d3SJohn Marino dmul = d << GMP_NAIL_BITS;
13886d7f5d3SJohn Marino
13986d7f5d3SJohn Marino i = 0;
14086d7f5d3SJohn Marino do
14186d7f5d3SJohn Marino {
14286d7f5d3SJohn Marino s = src[i];
14386d7f5d3SJohn Marino SUBC_LIMB (c, l, s, c);
14486d7f5d3SJohn Marino l = (l * inverse) & GMP_NUMB_MASK;
14586d7f5d3SJohn Marino umul_ppmm (h, dummy, l, dmul);
14686d7f5d3SJohn Marino c += h;
14786d7f5d3SJohn Marino }
14886d7f5d3SJohn Marino while (++i < size-1);
14986d7f5d3SJohn Marino
15086d7f5d3SJohn Marino
15186d7f5d3SJohn Marino s = src[i];
15286d7f5d3SJohn Marino if (s <= d)
15386d7f5d3SJohn Marino {
15486d7f5d3SJohn Marino /* With high<=d the final step can be a subtract and addback. If c==0
15586d7f5d3SJohn Marino then the addback will restore to l>=0. If c==d then will get l==d
15686d7f5d3SJohn Marino if s==0, but that's ok per the function definition. */
15786d7f5d3SJohn Marino
15886d7f5d3SJohn Marino l = c - s;
15986d7f5d3SJohn Marino if (c < s)
16086d7f5d3SJohn Marino l += d;
16186d7f5d3SJohn Marino
16286d7f5d3SJohn Marino ret = l;
16386d7f5d3SJohn Marino }
16486d7f5d3SJohn Marino else
16586d7f5d3SJohn Marino {
16686d7f5d3SJohn Marino /* Can't skip a divide, just do the loop code once more. */
16786d7f5d3SJohn Marino
16886d7f5d3SJohn Marino SUBC_LIMB (c, l, s, c);
16986d7f5d3SJohn Marino l = (l * inverse) & GMP_NUMB_MASK;
17086d7f5d3SJohn Marino umul_ppmm (h, dummy, l, dmul);
17186d7f5d3SJohn Marino c += h;
17286d7f5d3SJohn Marino ret = c;
17386d7f5d3SJohn Marino }
17486d7f5d3SJohn Marino
17586d7f5d3SJohn Marino ASSERT (orig_c < d ? ret < d : ret <= d);
17686d7f5d3SJohn Marino return ret;
17786d7f5d3SJohn Marino }
17886d7f5d3SJohn Marino
17986d7f5d3SJohn Marino
18086d7f5d3SJohn Marino
18186d7f5d3SJohn Marino #if 0
18286d7f5d3SJohn Marino
18386d7f5d3SJohn Marino /* The following is an alternate form that might shave one cycle on a
18486d7f5d3SJohn Marino superscalar processor since it takes c+=h off the dependent chain,
18586d7f5d3SJohn Marino leaving just a low product, high product, and a subtract.
18686d7f5d3SJohn Marino
18786d7f5d3SJohn Marino This is for CPU specific implementations to consider. A special case for
18886d7f5d3SJohn Marino high<divisor and/or size==1 can be added if desired.
18986d7f5d3SJohn Marino
19086d7f5d3SJohn Marino Notice that c is only ever 0 or 1, since if s-c produces a borrow then
19186d7f5d3SJohn Marino x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
19286d7f5d3SJohn Marino c=(x==0xFF..FF) too, if that helped. */
19386d7f5d3SJohn Marino
19486d7f5d3SJohn Marino mp_limb_t
19586d7f5d3SJohn Marino mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
19686d7f5d3SJohn Marino {
19786d7f5d3SJohn Marino mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
19886d7f5d3SJohn Marino mp_limb_t c = 0;
19986d7f5d3SJohn Marino mp_size_t i;
20086d7f5d3SJohn Marino
20186d7f5d3SJohn Marino ASSERT (size >= 1);
20286d7f5d3SJohn Marino ASSERT (d & 1);
20386d7f5d3SJohn Marino
20486d7f5d3SJohn Marino binvert_limb (inverse, d);
20586d7f5d3SJohn Marino dmul = d << GMP_NAIL_BITS;
20686d7f5d3SJohn Marino
20786d7f5d3SJohn Marino for (i = 0; i < size; i++)
20886d7f5d3SJohn Marino {
20986d7f5d3SJohn Marino ASSERT (c==0 || c==1);
21086d7f5d3SJohn Marino
21186d7f5d3SJohn Marino s = src[i];
21286d7f5d3SJohn Marino SUBC_LIMB (c1, x, s, c);
21386d7f5d3SJohn Marino
21486d7f5d3SJohn Marino SUBC_LIMB (c2, y, x, h);
21586d7f5d3SJohn Marino c = c1 + c2;
21686d7f5d3SJohn Marino
21786d7f5d3SJohn Marino y = (y * inverse) & GMP_NUMB_MASK;
21886d7f5d3SJohn Marino umul_ppmm (h, dummy, y, dmul);
21986d7f5d3SJohn Marino }
22086d7f5d3SJohn Marino
22186d7f5d3SJohn Marino h += c;
22286d7f5d3SJohn Marino return h;
22386d7f5d3SJohn Marino }
22486d7f5d3SJohn Marino
22586d7f5d3SJohn Marino #endif
226