/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008, 2009, 2010, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
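
/* Usage sketch (illustrative only; see mpn/generic/mod_1.c for the real
   caller).  The cps/bmodb array is filled in by mpn_mod_1_1p_cps, and the
   divisor passed to mpn_mod_1_1p is pre-shifted left by the stored
   normalization count cps[1]:

     mp_limb_t pre[4];
     mpn_mod_1_1p_cps (pre, b);
     r = mpn_mod_1_1p (ap, n, b << pre[1], pre);

   Here b is the original, possibly unnormalized divisor and n >= 2.  */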

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask: m is zero when there is no carry
 * out of the high limb and all ones (i.e. -1) when there is. */

#if defined (__GNUC__)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %k2\n\t"					\
	     "adc	%4, %k1\n\t"					\
	     "sbb	%k0, %k0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %q2\n\t"					\
	     "adc	%4, %q1\n\t"					\
	     "sbb	%q0, %q0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxcc	%r3, %4, %1\n\t"				\
	     "subx	%%g0, %%g0, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addccc	%r7, %8, %%g0\n\t"				\
	     "addccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
	     "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	 __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add%I6c	%2, %5, %6\n\t"					\
	     "adde	%1, %3, %4\n\t"					\
	     "subfe	%0, %0, %0\n\t"					\
	     "nor	%0, %0, %0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "algr	%2, %6\n\t"					\
	     "alcgr	%1, %4\n\t"					\
	     "lghi	%0, 0\n\t"					\
	     "alcgr	%0, %0\n\t"					\
	     "lcgr	%0, %0"						\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),			\
	     "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "adds	%2, %5, %6\n\t"					\
	     "adcs	%1, %3, %4\n\t"					\
	     "movcc	%0, #0\n\t"					\
	     "movcs	%0, #-1"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
/* Generic C version: add the two-limb numbers {a1,a0} and {b1,b0},
   storing the low two limbs of the sum in s1,s0 and the carry out of
   the high limb, negated into a full-limb mask, in m.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (m) = - (__c1 + (__s1 < __c0));					\
  } while (0)
#endif
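
/* For illustration (numbers chosen for 4-bit limbs purely to keep the
   arithmetic small; real limbs are GMP_LIMB_BITS wide), with B = 16,
   add_mssaaaa (m, s1, s0, 15, 9, 1, 8) gives

     low limbs:   9 + 8      = 16 + 1  ->  s0 = 1, carry 1
     high limbs: 15 + 1 + 1  = 16 + 1  ->  s1 = 1, carry 1

   so {s1,s0} = {1,1}, and the carry out of the high limb is returned as
   the mask m = -1 (all ones) rather than as a 0/1 bit.  */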

#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);    // NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}
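
/* Summary of the precomputed values (method 1), writing d for the divisor
   given to mpn_mod_1_1p_cps and B = 2^GMP_LIMB_BITS: cps[0] is the
   invert_limb reciprocal of the normalized divisor d << cnt, cps[1] is cnt,
   and cps[2] and cps[3] are essentially B mod d and B^2 mod d (not
   necessarily fully reduced, as the asserts note); internally they are
   computed for the normalized divisor and shifted back down by cnt before
   being stored.  mpn_mod_1_1p below expects its b argument to be the
   normalized divisor d << cnt.  */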

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, 0, ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]				< B
	    + LO(rr)  * (B mod b)		<= (B-1)(b-1)
	    + HI(rr)  * (B^2 mod b)		<= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, 0, ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
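
/* Sketch of why the tail of the function works: the main loop keeps the
   two-limb value <rh,rl> congruent, modulo the original divisor
   d = b >> cnt, to the full input {ap,n}, since bmodb[2] and bmodb[3] are
   essentially B mod d and B^2 mod d.  The final lines scale that value by
   2^cnt, reduce it modulo the normalized b = d << cnt, obtaining the
   remainder shifted left by cnt, and then shift the result back down.  */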
#endif /* MOD_1_1P_METHOD == 1 */

#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);    /* NB: equality iff b = B/2 */
  cps[3] = B2modb;
}
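
/* Note the difference from method 1: here cps[3] is essentially B^2 mod b
   for the normalized divisor, stored without the final right shift by cnt
   (equality with b only for b = B/2, per the assert), and cps[2] is filled
   in only when cnt != 0, since mpn_mod_1_1p below uses B1modb only on its
   unnormalized path.  */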

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

      for (j = n-4; j >= 0; j--)
	{
	  mp_limb_t cy;
	  /* mp_limb_t t = r0 + B2mb; */
	  umul_ppmm (p1, p0, r1, B2modb);

	  ADDC_LIMB (cy, r0, r0, r2 & B2modb);
	  /* Alternative, for cmov: if (cy) r0 = t; */
	  r0 -= (-cy) & b;
	  add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
	}

      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows
	 that. */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */