/* mpn_mod_1s_4p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.
   Requires that b < B / 4.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008, 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2786d7f5d3SJohn Marino
2886d7f5d3SJohn Marino #include "gmp.h"
2986d7f5d3SJohn Marino #include "gmp-impl.h"
3086d7f5d3SJohn Marino #include "longlong.h"
3186d7f5d3SJohn Marino
3286d7f5d3SJohn Marino void
mpn_mod_1s_4p_cps(mp_limb_t cps[7],mp_limb_t b)3386d7f5d3SJohn Marino mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
3486d7f5d3SJohn Marino {
3586d7f5d3SJohn Marino mp_limb_t bi;
3686d7f5d3SJohn Marino mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
3786d7f5d3SJohn Marino int cnt;
3886d7f5d3SJohn Marino
3986d7f5d3SJohn Marino ASSERT (b <= (~(mp_limb_t) 0) / 4);
4086d7f5d3SJohn Marino
4186d7f5d3SJohn Marino count_leading_zeros (cnt, b);
4286d7f5d3SJohn Marino
4386d7f5d3SJohn Marino b <<= cnt;
4486d7f5d3SJohn Marino invert_limb (bi, b);
4586d7f5d3SJohn Marino
4686d7f5d3SJohn Marino B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
4786d7f5d3SJohn Marino ASSERT (B1modb <= b); /* NB: not fully reduced mod b */
4886d7f5d3SJohn Marino udiv_rnd_preinv (B2modb, B1modb, b, bi);
4986d7f5d3SJohn Marino udiv_rnd_preinv (B3modb, B2modb, b, bi);
5086d7f5d3SJohn Marino udiv_rnd_preinv (B4modb, B3modb, b, bi);
5186d7f5d3SJohn Marino udiv_rnd_preinv (B5modb, B4modb, b, bi);
5286d7f5d3SJohn Marino
5386d7f5d3SJohn Marino cps[0] = bi;
5486d7f5d3SJohn Marino cps[1] = cnt;
5586d7f5d3SJohn Marino cps[2] = B1modb >> cnt;
5686d7f5d3SJohn Marino cps[3] = B2modb >> cnt;
5786d7f5d3SJohn Marino cps[4] = B3modb >> cnt;
5886d7f5d3SJohn Marino cps[5] = B4modb >> cnt;
5986d7f5d3SJohn Marino cps[6] = B5modb >> cnt;
6086d7f5d3SJohn Marino
6186d7f5d3SJohn Marino #if WANT_ASSERT
6286d7f5d3SJohn Marino {
6386d7f5d3SJohn Marino int i;
6486d7f5d3SJohn Marino b = cps[2];
6586d7f5d3SJohn Marino for (i = 3; i <= 6; i++)
6686d7f5d3SJohn Marino {
6786d7f5d3SJohn Marino b += cps[i];
6886d7f5d3SJohn Marino ASSERT (b >= cps[i]);
6986d7f5d3SJohn Marino }
7086d7f5d3SJohn Marino }
7186d7f5d3SJohn Marino #endif
7286d7f5d3SJohn Marino }
7386d7f5d3SJohn Marino
7486d7f5d3SJohn Marino mp_limb_t
mpn_mod_1s_4p(mp_srcptr ap,mp_size_t n,mp_limb_t b,mp_limb_t cps[7])7586d7f5d3SJohn Marino mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[7])
7686d7f5d3SJohn Marino {
7786d7f5d3SJohn Marino mp_limb_t rh, rl, bi, q, ph, pl, ch, cl, r;
7886d7f5d3SJohn Marino mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
7986d7f5d3SJohn Marino mp_size_t i;
8086d7f5d3SJohn Marino int cnt;
8186d7f5d3SJohn Marino
8286d7f5d3SJohn Marino ASSERT (n >= 1);
8386d7f5d3SJohn Marino
8486d7f5d3SJohn Marino B1modb = cps[2];
8586d7f5d3SJohn Marino B2modb = cps[3];
8686d7f5d3SJohn Marino B3modb = cps[4];
8786d7f5d3SJohn Marino B4modb = cps[5];
8886d7f5d3SJohn Marino B5modb = cps[6];
8986d7f5d3SJohn Marino
9086d7f5d3SJohn Marino switch (n & 3)
9186d7f5d3SJohn Marino {
9286d7f5d3SJohn Marino case 0:
9386d7f5d3SJohn Marino umul_ppmm (ph, pl, ap[n - 3], B1modb);
9486d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
9586d7f5d3SJohn Marino umul_ppmm (ch, cl, ap[n - 2], B2modb);
9686d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, ch, cl);
9786d7f5d3SJohn Marino umul_ppmm (rh, rl, ap[n - 1], B3modb);
9886d7f5d3SJohn Marino add_ssaaaa (rh, rl, rh, rl, ph, pl);
9986d7f5d3SJohn Marino n -= 4;
10086d7f5d3SJohn Marino break;
10186d7f5d3SJohn Marino case 1:
10286d7f5d3SJohn Marino rh = 0;
10386d7f5d3SJohn Marino rl = ap[n - 1];
10486d7f5d3SJohn Marino n -= 1;
10586d7f5d3SJohn Marino break;
10686d7f5d3SJohn Marino case 2:
10786d7f5d3SJohn Marino umul_ppmm (ph, pl, ap[n - 1], B1modb);
10886d7f5d3SJohn Marino add_ssaaaa (rh, rl, ph, pl, 0, ap[n - 2]);
10986d7f5d3SJohn Marino n -= 2;
11086d7f5d3SJohn Marino break;
11186d7f5d3SJohn Marino case 3:
11286d7f5d3SJohn Marino umul_ppmm (ph, pl, ap[n - 2], B1modb);
11386d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
11486d7f5d3SJohn Marino umul_ppmm (rh, rl, ap[n - 1], B2modb);
11586d7f5d3SJohn Marino add_ssaaaa (rh, rl, rh, rl, ph, pl);
11686d7f5d3SJohn Marino n -= 3;
11786d7f5d3SJohn Marino break;
11886d7f5d3SJohn Marino }
11986d7f5d3SJohn Marino
12086d7f5d3SJohn Marino for (i = n - 4; i >= 0; i -= 4)
12186d7f5d3SJohn Marino {
12286d7f5d3SJohn Marino /* rr = ap[i] < B
12386d7f5d3SJohn Marino + ap[i+1] * (B mod b) <= (B-1)(b-1)
12486d7f5d3SJohn Marino + ap[i+2] * (B^2 mod b) <= (B-1)(b-1)
12586d7f5d3SJohn Marino + ap[i+3] * (B^3 mod b) <= (B-1)(b-1)
12686d7f5d3SJohn Marino + LO(rr) * (B^4 mod b) <= (B-1)(b-1)
12786d7f5d3SJohn Marino + HI(rr) * (B^5 mod b) <= (B-1)(b-1)
12886d7f5d3SJohn Marino */
12986d7f5d3SJohn Marino umul_ppmm (ph, pl, ap[i + 1], B1modb);
13086d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);
13186d7f5d3SJohn Marino
13286d7f5d3SJohn Marino umul_ppmm (ch, cl, ap[i + 2], B2modb);
13386d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, ch, cl);
13486d7f5d3SJohn Marino
13586d7f5d3SJohn Marino umul_ppmm (ch, cl, ap[i + 3], B3modb);
13686d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, ch, cl);
13786d7f5d3SJohn Marino
13886d7f5d3SJohn Marino umul_ppmm (ch, cl, rl, B4modb);
13986d7f5d3SJohn Marino add_ssaaaa (ph, pl, ph, pl, ch, cl);
14086d7f5d3SJohn Marino
14186d7f5d3SJohn Marino umul_ppmm (rh, rl, rh, B5modb);
14286d7f5d3SJohn Marino add_ssaaaa (rh, rl, rh, rl, ph, pl);
14386d7f5d3SJohn Marino }
14486d7f5d3SJohn Marino
14586d7f5d3SJohn Marino bi = cps[0];
14686d7f5d3SJohn Marino cnt = cps[1];
14786d7f5d3SJohn Marino
14886d7f5d3SJohn Marino #if 1
14986d7f5d3SJohn Marino umul_ppmm (rh, cl, rh, B1modb);
15086d7f5d3SJohn Marino add_ssaaaa (rh, rl, rh, rl, 0, cl);
15186d7f5d3SJohn Marino r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
15286d7f5d3SJohn Marino #else
15386d7f5d3SJohn Marino udiv_qrnnd_preinv (q, r, rh >> (GMP_LIMB_BITS - cnt),
15486d7f5d3SJohn Marino (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)), b, bi);
15586d7f5d3SJohn Marino ASSERT (q <= 4); /* optimize for small quotient? */
15686d7f5d3SJohn Marino #endif
15786d7f5d3SJohn Marino
15886d7f5d3SJohn Marino udiv_qrnnd_preinv (q, r, r, rl << cnt, b, bi);
15986d7f5d3SJohn Marino
16086d7f5d3SJohn Marino return r >> cnt;
16186d7f5d3SJohn Marino }
162