/* mpn_mod_1s_4p (ap, n, b, cps)
   Divide {ap, n} by b.  Return the single-limb remainder.
   Requires that b < B / 4, where B = 2^GMP_LIMB_BITS.

   Contributed to the GNU project by Torbjorn Granlund.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"

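/* Usage sketch (illustrative only, not part of the library): the two
   functions below form a precompute/apply pair.  Assuming the calling
   convention of GMP's generic mpn_mod_1 dispatch (the divisor handed to
   mpn_mod_1s_4p is pre-shifted left by the count stored in cps[1]), a
   caller reducing many vectors modulo the same b (b < B/4) would do
   roughly:

     mp_limb_t cps[7], r;
     mpn_mod_1s_4p_cps (cps, b);
     r = mpn_mod_1s_4p (ap, n, b << cps[1], cps);

   so the precomputation cost is paid once per divisor.  */
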
void
mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
  int cnt;

  ASSERT (b <= (~(mp_limb_t) 0) / 4);

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

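  /* cps[0] is the reciprocal of the normalized divisor b << cnt (from
     invert_limb), cps[1] is the normalization count, and cps[2]..cps[6]
     hold B^k mod b for k = 1..5, computed below as residues of the
     normalized divisor and then shifted back down by cnt.  */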
  cps[0] = bi;
  cps[1] = cnt;

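  /* B1modb = B * 2^cnt mod b (b is already normalized here); the second
     factor approximates floor (B * 2^cnt / b) using the inverse bi, so
     the result may not be fully reduced, hence the ASSERT below.  */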
  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

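  /* The remaining residues are chained: each udiv_rnnd_preinv reduces the
     previous residue times B modulo the normalized b, giving
     B^k * 2^cnt mod b for k = 2..5.  */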
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;

  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
  cps[4] = B3modb >> cnt;

  udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
  cps[5] = B4modb >> cnt;

  udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi);
  cps[6] = B5modb >> cnt;

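  /* Sanity check that the five stored residues sum without overflowing a
     limb.  */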
#if WANT_ASSERT
  {
    int i;
    b = cps[2];
    for (i = 3; i <= 6; i++)
      {
	b += cps[i];
	ASSERT (b >= cps[i]);
      }
  }
#endif
}

mp_limb_t
mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
{
  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
  mp_size_t i;
  int cnt;

  ASSERT (n >= 1);

  B1modb = cps[2];
  B2modb = cps[3];
  B3modb = cps[4];
  B4modb = cps[5];
  B5modb = cps[6];

  if ((b >> 32) == 0)
    {
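      /* Divisor fits in 32 bits: this arm is the same as the else arm
	 below except that it uses the umul_ppmm_s multiply helper from
	 sparc64.h instead of umul_ppmm.  */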
      switch (n & 3)
	{
	case 0:
	  umul_ppmm_s (ph, pl, ap[n - 3], B1modb);
	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]);
	  umul_ppmm_s (ch, cl, ap[n - 2], B2modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
	  umul_ppmm_s (rh, rl, ap[n - 1], B3modb);
	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
	  n -= 4;
	  break;
	case 1:
	  rh = 0;
	  rl = ap[n - 1];
	  n -= 1;
	  break;
	case 2:
	  rh = ap[n - 1];
	  rl = ap[n - 2];
	  n -= 2;
	  break;
	case 3:
	  umul_ppmm_s (ph, pl, ap[n - 2], B1modb);
	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
	  umul_ppmm_s (rh, rl, ap[n - 1], B2modb);
	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
	  n -= 3;
	  break;
	}

      for (i = n - 4; i >= 0; i -= 4)
	{
	  /* rr = ap[i]				< B
		+ ap[i+1] * (B mod b)		<= (B-1)(b-1)
		+ ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
		+ ap[i+3] * (B^3 mod b)		<= (B-1)(b-1)
		+ LO(rr)  * (B^4 mod b)		<= (B-1)(b-1)
		+ HI(rr)  * (B^5 mod b)		<= (B-1)(b-1)
	  */
	  umul_ppmm_s (ph, pl, ap[i + 1], B1modb);
	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);

	  umul_ppmm_s (ch, cl, ap[i + 2], B2modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);

	  umul_ppmm_s (ch, cl, ap[i + 3], B3modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);

	  umul_ppmm_s (ch, cl, rl, B4modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);

	  umul_ppmm_s (rh, rl, rh, B5modb);
	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
	}

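      /* Fold the remaining high limb: rh * B + rl is congruent to
	 rh * (B mod b) + rl modulo b.  */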
      umul_ppmm_s (rh, cl, rh, B1modb);
      add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
    }
  else
    {
      switch (n & 3)
	{
	case 0:
	  umul_ppmm (ph, pl, ap[n - 3], B1modb);
	  add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
	  umul_ppmm (ch, cl, ap[n - 2], B2modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
	  umul_ppmm (rh, rl, ap[n - 1], B3modb);
	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
	  n -= 4;
	  break;
	case 1:
	  rh = 0;
	  rl = ap[n - 1];
	  n -= 1;
	  break;
	case 2:
	  rh = ap[n - 1];
	  rl = ap[n - 2];
	  n -= 2;
	  break;
	case 3:
	  umul_ppmm (ph, pl, ap[n - 2], B1modb);
	  add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
	  umul_ppmm (rh, rl, ap[n - 1], B2modb);
	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
	  n -= 3;
	  break;
	}

      for (i = n - 4; i >= 0; i -= 4)
	{
	  /* rr = ap[i]				< B
		+ ap[i+1] * (B mod b)		<= (B-1)(b-1)
		+ ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
		+ ap[i+3] * (B^3 mod b)		<= (B-1)(b-1)
		+ LO(rr)  * (B^4 mod b)		<= (B-1)(b-1)
		+ HI(rr)  * (B^5 mod b)		<= (B-1)(b-1)
	  */
	  umul_ppmm (ph, pl, ap[i + 1], B1modb);
	  add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);

	  umul_ppmm (ch, cl, ap[i + 2], B2modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);

	  umul_ppmm (ch, cl, ap[i + 3], B3modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);

	  umul_ppmm (ch, cl, rl, B4modb);
	  add_ssaaaa (ph, pl, ph, pl, ch, cl);

	  umul_ppmm (rh, rl, rh, B5modb);
	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
	}

      umul_ppmm (rh, cl, rh, B1modb);
      add_ssaaaa (rh, rl, rh, rl, 0, cl);
    }

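  /* rh:rl is now a two-limb value congruent to {ap, n} mod b.  Shift it
     up by cnt, reduce it modulo the normalized divisor with the
     precomputed inverse, and shift the remainder back down.  */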
  bi = cps[0];
  cnt = cps[1];

  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
  udiv_rnnd_preinv (r, r, rl << cnt, b, bi);

  return r >> cnt;
}