/* mpn_mod_1s_4p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.
   Requires that b < B / 4.

   Contributed to the GNU project by Torbjorn Granlund.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2008-2010 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"

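/* mpn_mod_1s_4p_cps fills in the constant block cps[] for a divisor b
   (required to be < B/4, where B = 2^GMP_LIMB_BITS).  As computed below,
   the layout is:

     cps[0]  invert_limb() inverse of the normalized divisor b << cnt
     cps[1]  cnt, the number of leading zero bits of b
     cps[2]  B^1 mod b  (possibly not fully reduced; see the assert below)
     cps[3]  B^2 mod b
     cps[4]  B^3 mod b
     cps[5]  B^4 mod b
     cps[6]  B^5 mod b  */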
void
mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
  int cnt;

  ASSERT (b <= (~(mp_limb_t) 0) / 4);

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

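  /* The multiplier ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)) is
     (approximately) floor(B / (b >> cnt)), i.e. 1 << cnt plus the top cnt
     bits of the inverse; multiplying it by -b and reducing mod B leaves
     (B mod (b >> cnt)) << cnt, the residue of B pre-shifted by cnt.  Note
     that cnt >= 2 here, since the divisor is required to be < B/4, so the
     shift counts stay in range.  */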
  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);	/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;

  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
  cps[4] = B3modb >> cnt;

  udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
  cps[5] = B4modb >> cnt;

  udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi);
  cps[6] = B5modb >> cnt;
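  /* Each step above divides the previous residue, shifted up by one limb,
     by the normalized divisor; after the >> cnt adjustment cps[3], ...,
     cps[6] therefore hold B^2 mod b, ..., B^5 mod b for the original
     (unshifted) b.  */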
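/* Sanity check that the five stored residues can be summed without limb
   overflow, a bound the multiply-and-add loop in mpn_mod_1s_4p builds on.  */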
#if WANT_ASSERT
  {
    int i;
    b = cps[2];
    for (i = 3; i <= 6; i++)
      {
        b += cps[i];
        ASSERT (b >= cps[i]);
      }
  }
#endif
}

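/* Compute (ap,,n) mod b using the constants prepared by mpn_mod_1s_4p_cps.
   The main loop folds four limbs per iteration into a two-limb
   pseudo-remainder (rh:rl), using

     a0 + a1*B + a2*B^2 + a3*B^3 + rl*B^4 + rh*B^5
       == a0 + a1*(B mod b) + a2*(B^2 mod b) + a3*(B^3 mod b)
            + rl*(B^4 mod b) + rh*(B^5 mod b)   (mod b)

   so that the per-limb division is replaced by independent multiplies and
   adds; a single preinverted division at the end extracts the remainder.  */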
mp_limb_t
mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
{
  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
  mp_size_t i;
  int cnt;

  ASSERT (n >= 1);

  B1modb = cps[2];
  B2modb = cps[3];
  B3modb = cps[4];
  B4modb = cps[5];
  B5modb = cps[6];

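  /* Each residue is at most b, so when b fits in 32 bits every multiply
     below has a 32-bit second operand and can use umul_ppmm_s from
     sparc64.h (presumably a cheaper product for a small second operand);
     otherwise fall back to the generic umul_ppmm.  */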
  if ((b >> 32) == 0)
    {
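      /* Dispose of the leading n mod 4 limbs (or four limbs when n is a
         multiple of 4), leaving a limb count divisible by four for the
         main loop.  */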
      switch (n & 3)
        {
        case 0:
          umul_ppmm_s (ph, pl, ap[n - 3], B1modb);
          add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]);
          umul_ppmm_s (ch, cl, ap[n - 2], B2modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);
          umul_ppmm_s (rh, rl, ap[n - 1], B3modb);
          add_ssaaaa (rh, rl, rh, rl, ph, pl);
          n -= 4;
          break;
        case 1:
          rh = 0;
          rl = ap[n - 1];
          n -= 1;
          break;
        case 2:
          rh = ap[n - 1];
          rl = ap[n - 2];
          n -= 2;
          break;
        case 3:
          umul_ppmm_s (ph, pl, ap[n - 2], B1modb);
          add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
          umul_ppmm_s (rh, rl, ap[n - 1], B2modb);
          add_ssaaaa (rh, rl, rh, rl, ph, pl);
          n -= 3;
          break;
        }

      for (i = n - 4; i >= 0; i -= 4)
        {
          /* rr = ap[i]                        < B
                + ap[i+1] * (B mod b)          <= (B-1)(b-1)
                + ap[i+2] * (B^2 mod b)        <= (B-1)(b-1)
                + ap[i+3] * (B^3 mod b)        <= (B-1)(b-1)
                + LO(rr)  * (B^4 mod b)        <= (B-1)(b-1)
                + HI(rr)  * (B^5 mod b)        <= (B-1)(b-1)
          */
          umul_ppmm_s (ph, pl, ap[i + 1], B1modb);
          add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);

          umul_ppmm_s (ch, cl, ap[i + 2], B2modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);

          umul_ppmm_s (ch, cl, ap[i + 3], B3modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);

          umul_ppmm_s (ch, cl, rl, B4modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);

          umul_ppmm_s (rh, rl, rh, B5modb);
          add_ssaaaa (rh, rl, rh, rl, ph, pl);
        }

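      /* Fold the high limb of the pseudo-remainder through B mod b, so the
         remaining two-limb value has a high limb small enough for the final
         2/1 division.  */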
      umul_ppmm_s (rh, cl, rh, B1modb);
      add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
    }
  else
    {
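      /* Same reduction scheme as the branch above, but using the generic
         umul_ppmm, since the operands need not fit in 32 bits.  */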
      switch (n & 3)
        {
        case 0:
          umul_ppmm (ph, pl, ap[n - 3], B1modb);
          add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
          umul_ppmm (ch, cl, ap[n - 2], B2modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);
          umul_ppmm (rh, rl, ap[n - 1], B3modb);
          add_ssaaaa (rh, rl, rh, rl, ph, pl);
          n -= 4;
          break;
        case 1:
          rh = 0;
          rl = ap[n - 1];
          n -= 1;
          break;
        case 2:
          rh = ap[n - 1];
          rl = ap[n - 2];
          n -= 2;
          break;
        case 3:
          umul_ppmm (ph, pl, ap[n - 2], B1modb);
          add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
          umul_ppmm (rh, rl, ap[n - 1], B2modb);
          add_ssaaaa (rh, rl, rh, rl, ph, pl);
          n -= 3;
          break;
        }

      for (i = n - 4; i >= 0; i -= 4)
        {
          /* rr = ap[i]                        < B
                + ap[i+1] * (B mod b)          <= (B-1)(b-1)
                + ap[i+2] * (B^2 mod b)        <= (B-1)(b-1)
                + ap[i+3] * (B^3 mod b)        <= (B-1)(b-1)
                + LO(rr)  * (B^4 mod b)        <= (B-1)(b-1)
                + HI(rr)  * (B^5 mod b)        <= (B-1)(b-1)
          */
          umul_ppmm (ph, pl, ap[i + 1], B1modb);
          add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);

          umul_ppmm (ch, cl, ap[i + 2], B2modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);

          umul_ppmm (ch, cl, ap[i + 3], B3modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);

          umul_ppmm (ch, cl, rl, B4modb);
          add_ssaaaa (ph, pl, ph, pl, ch, cl);

          umul_ppmm (rh, rl, rh, B5modb);
          add_ssaaaa (rh, rl, rh, rl, ph, pl);
        }

      umul_ppmm (rh, cl, rh, B1modb);
      add_ssaaaa (rh, rl, rh, rl, 0, cl);
    }

  bi = cps[0];
  cnt = cps[1];

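  /* Final reduction: shift the two-limb value up by cnt bits to match the
     normalized divisor, divide using the precomputed inverse, and shift the
     remainder back down.  cnt >= 2 (the divisor handed to the cps function
     is < B/4), so GMP_LIMB_BITS - cnt is a valid shift count.  */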
  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
  udiv_rnnd_preinv (r, r, rl << cnt, b, bi);

  return r >> cnt;
}