1 /* mulmod_bnm1.c -- multiplication mod B^n-1.
2
3 Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
4 Marco Bodrato.
5
6 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
7 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10 Copyright 2009, 2010 Free Software Foundation, Inc.
11
12 This file is part of the GNU MP Library.
13
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22 License for more details.
23
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
26
27
28 #include "gmp.h"
29 #include "gmp-impl.h"
30 #include "longlong.h"
31
32 /* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
33 mod B^rn - 1, and values are semi-normalised; zero is represented
34 as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp.
35 tp==rp is allowed. */
36 void
mpn_bc_mulmod_bnm1(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t rn,mp_ptr tp)37 mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
38 mp_ptr tp)
39 {
40 mp_limb_t cy;
41
42 ASSERT (0 < rn);
43
44 mpn_mul_n (tp, ap, bp, rn);
45 cy = mpn_add_n (rp, tp, tp + rn, rn);
46 /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
47 * be no overflow when adding in the carry. */
48 MPN_INCR_U (rp, rn, cy);
49 }
50
51
52 /* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
53 semi-normalised representation, computation is mod B^rn + 1. Needs
54 a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
55 Output is normalised. */
56 static void
mpn_bc_mulmod_bnp1(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t rn,mp_ptr tp)57 mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
58 mp_ptr tp)
59 {
60 mp_limb_t cy;
61
62 ASSERT (0 < rn);
63
64 mpn_mul_n (tp, ap, bp, rn + 1);
65 ASSERT (tp[2*rn+1] == 0);
66 ASSERT (tp[2*rn] < GMP_NUMB_MAX);
67 cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
68 rp[rn] = 0;
69 MPN_INCR_U (rp, rn+1, cy );
70 }
71
72
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  /* Odd rn cannot be split in half, and small rn is not worth the CRT
     machinery: handle both with a plain (wrapped) multiplication.  */
  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      /* Product fits; no reduction needed.  */
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      /* Full product, then fold the part above B^rn back in,
		 using B^rn == 1 (mod B^rn - 1).  */
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the coded slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	/* Reduce a and b mod B^n - 1 by folding their high halves onto
	   their low halves (when the operands actually exceed n limbs);
	   short operands are used directly.  */
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    if (LIKELY (bn > n))
	      {
		bm1 = xp + n;
		cy = mpn_add (xp + n, b0, n, b1, bn - n);
		MPN_INCR_U (xp + n, n, cy);
		bnm = n;
		so = xp + 2*n;
	      }
	    else
	      {
		so = xp + n;
		bm1 = b0;
		bnm = bn;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	    bm1 = b0;
	    bnm = bn;
	  }

	/* Recurse for xm = a*b mod (B^n - 1), placed at rp; so points
	   just past the reduced operands, giving the scratch area.  */
	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int		k;
	mp_srcptr	ap1, bp1;
	mp_size_t	anp, bnp;

	/* Reduce a and b mod B^n + 1: high half is subtracted from the
	   low half (B^n == -1), with a top limb at index n holding the
	   possible carry from normalisation.  */
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (LIKELY (bn > n)) {
	  bp1 = sp1 + n + 1;
	  cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	  sp1[2*n+1] = 0;
	  MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	  bnp = n + bp1[n];
	} else {
	  bp1 = b0;
	  bnp = bn;
	}

	/* Pick an FFT transform size k dividing n (mpn_mul_fft needs
	   2^k | n); k = 0 means no suitable FFT, use other methods.  */
	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) -1;
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    /* b was short (bn <= n): compute the plain product and
	       reduce it mod B^n + 1 by hand.  */
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
      */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}

#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
332
333 mp_size_t
mpn_mulmod_bnm1_next_size(mp_size_t n)334 mpn_mulmod_bnm1_next_size (mp_size_t n)
335 {
336 mp_size_t nh;
337
338 if (BELOW_THRESHOLD (n, MULMOD_BNM1_THRESHOLD))
339 return n;
340 if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
341 return (n + (2-1)) & (-2);
342 if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
343 return (n + (4-1)) & (-4);
344
345 nh = (n + 1) >> 1;
346
347 if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
348 return (n + (8-1)) & (-8);
349
350 return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
351 }
352