/* mpn_div_qr_1n_pi1

   Contributed to the GNU project by Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS > 0
#error Nail bits not supported
#endif

#ifndef DIV_QR_1N_METHOD
#define DIV_QR_1N_METHOD 2
#endif
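
/* DIV_QR_1N_METHOD selects one of the two implementations below: method 1
   is the straightforward loop around udiv_qrnnd_preinv, method 2 (the
   default) avoids a full 2/1 division step inside its main loop.

   A minimal usage sketch (illustration only, not code from this file;
   callers in the mpn layer, e.g. div_qr_1.c, do the equivalent): the
   divisor must be normalized and pre-inverted first, presumably via
   invert_limb from gmp-impl.h,

     mp_limb_t dinv, r;
     ASSERT (d & GMP_NUMB_HIGHBIT);
     invert_limb (dinv, d);
     r = mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv);

   after which {qp,n} holds the quotient and r the remainder of
   uh*B^n + {up,n} modulo d.  */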

/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */

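/* add_mssaaaa(m, s1, s0, a1, a0, b1, b0) computes the two-limb sum
   <s1,s0> = <a1,a0> + <b1,b0> and sets m to the carry out of that sum,
   represented as a mask: all ones if the addition overflowed two limbs,
   zero otherwise (see the generic C version at the end of this block).  */
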
#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %k2\n\t"					\
	     "adc	%4, %k1\n\t"					\
	     "sbb	%k0, %k0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %q2\n\t"					\
	     "adc	%4, %q1\n\t"					\
	     "sbb	%q0, %q0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxcc	%r3, %4, %1\n\t"				\
	     "subx	%%g0, %%g0, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addccc	%r7, %8, %%g0\n\t"				\
	     "addccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
	     "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	 __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add%I6c	%2, %5, %6\n\t"					\
	     "adde	%1, %3, %4\n\t"					\
	     "subfe	%0, %0, %0\n\t"					\
	     "nor	%0, %0, %0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)			\
	   __CLOBBER_CC)
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "algr	%2, %6\n\t"					\
	     "alcgr	%1, %4\n\t"					\
	     "lghi	%0, 0\n\t"					\
	     "alcgr	%0, %0\n\t"					\
	     "lcgr	%0, %0"						\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),			\
	     "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "adds	%2, %5, %6\n\t"					\
	     "adcs	%1, %3, %4\n\t"					\
	     "movcc	%0, #0\n\t"					\
	     "movcs	%0, #-1"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (m) = - (__c1 + (__s1 < __c0));					\
  } while (0)
#endif

#if DIV_QR_1N_METHOD == 1

/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
   Requires that uh < d. */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
		   mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  do
    {
      mp_limb_t q, ul;

      ul = up[--n];
      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
      qp[n] = q;
    }
  while (n > 0);

  return uh;
}

#elif DIV_QR_1N_METHOD == 2

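/* Same interface as above: divides (u1 B^n + {up, n}) by d, storing the
   n-limb quotient at {qp, n} and returning the remainder.  Requires
   u1 < d and d normalized (high bit set).  */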
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
		   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

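  /* B2 = -d * dinv (mod B) is congruent to B^2 modulo d (dinv being the
     usual 2/1 reciprocal floor((B^2-1)/d) - B); the loop below uses it to
     fold the high part of the partial remainder back in without dividing.  */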
  /* FIXME: Could be precomputed */
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];	/* Early read, to allow qp == up. */
  qp[n-1] = q1;

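  /* <u1,u0> = <u0, up[n-2]> + u1 * B2; u2 is the carry out of that sum,
     kept as a mask (zero or all ones).  */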
  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *    +---+---+
       *    | 1 | v |  (conditional on u2)
       *    +---+---+
       *        | 1 |  (conditional on u0 + u2 B2 carry)
       *        +---+
       * +      | q0|
       *  -+---+---+---+
       *   | q2| q1| q0|
       *   +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

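      /* Remainder update.  cy is the carry out of u0 + (u2 & B2); when it is
	 set, the conditional subtraction of d borrows right back, so u0 ends
	 up holding u0 + (u2 & B2) - cy*d exactly, and cy is the "conditional
	 on u0 + u2 B2 carry" term added to q below.  */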
      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;

      /* Final q update */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }

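  /* Final reduction: fold the last carry mask u2 and bring u1 below d,
     crediting each subtracted d to the B-weighted quotient word q1; a last
     2/1 division of <u1,u0> then yields the low quotient limb and the
     remainder.  */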
  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#else
#error Unknown DIV_QR_1N_METHOD
#endif