/* mpn_div_qr_1n_pi1

   Contributed to the GNU project by Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


   Copyright 2013 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library. If not,
   see https://www.gnu.org/licenses/. */

#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS > 0
#error Nail bits not supported
#endif

#ifndef DIV_QR_1N_METHOD
#define DIV_QR_1N_METHOD 2
#endif

/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */

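/* add_mssaaaa (m, s1, s0, a1, a0, b1, b0) computes the two-limb sum
   <s1,s0> = <a1,a0> + <b1,b0> and sets m to an all-ones mask if the
   addition carries out of the high limb, and to 0 otherwise (see the C
   fallback below).  For example, <a1,a0> = <1, B-1> plus <b1,b0> = <0, 1>
   gives <s1,s0> = <2, 0> with m = 0.  */
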
#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %k2\n\t" \
            "adc %4, %k1\n\t" \
            "sbb %k0, %k0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
              "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %q2\n\t" \
            "adc %4, %q1\n\t" \
            "sbb %q0, %q0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
              "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxcc %r3, %4, %1\n\t" \
            "subx %%g0, %%g0, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addccc %r7, %8, %%g0\n\t" \
            "addccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
              "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
            __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry. */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add%I6c %2, %5, %6\n\t" \
            "adde %1, %3, %4\n\t" \
            "subfe %0, %0, %0\n\t" \
            "nor %0, %0, %0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \
            __CLOBBER_CC)
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "algr %2, %6\n\t" \
            "alcgr %1, %4\n\t" \
            "lghi %0, 0\n\t" \
            "alcgr %0, %0\n\t" \
            "lcgr %0, %0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
              "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %5, %6\n\t" \
            "adcs %1, %3, %4\n\t" \
            "movcc %0, #0\n\t" \
            "movcs %0, #-1" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); /* carry out of the low limb */ \
    __c1 = __s1 < (a1); /* carry out of the high limb */ \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (m) = - (__c1 + (__s1 < __c0)); /* carry mask: 0 or all ones */ \
  } while (0)
#endif

#if DIV_QR_1N_METHOD == 1

/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
   Requires that uh < d. */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
                   mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  do
    {
      mp_limb_t q, ul;

      ul = up[--n];
      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
      qp[n] = q;
    }
  while (n > 0);

  return uh;
}

#elif DIV_QR_1N_METHOD == 2

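/* Same interface as above: divides (u1 B^n + {up, n}) by d, storing the
   quotient at {qp, n} and returning the remainder.  Requires u1 < d and a
   normalized d (high bit set); dinv is the usual precomputed inverse,
   floor ((B^2 - 1) / d) - B, as produced by invert_limb.  */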
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

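  /* B2 = -d * dinv is the single-limb value B^2 - d * (B + dinv), so B2 is
     congruent to B^2 (mod d), with 0 < B2 <= d.  It is used below to fold
     the high limbs of the partial remainder back in modulo d.  */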
  /* FIXME: Could be precomputed */
  B2 = -d*dinv;

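  /* <q1, q0> = (B + dinv) * u1 is the first quotient approximation, and
     <p1, p0> = B2 * u1 will fold the u1 B^2 term into the next partial
     remainder.  */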
  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *      +-------+
       *      |u1 * v |
       *      +---+---+
       *      | u1|
       *  +---+---+
       *  | 1 | v |  (conditional on u2)
       *  +---+---+
       *      | 1 |    (conditional on u0 + u2 B2 carry)
       *      +---+
       *    + | q0|
       * -+---+---+---+
       *  | q2| q1| q0|
       *  +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

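      /* u update: u2 is a carry mask (0 or all ones).  Fold u2 B^2, which
         is congruent to u2 B2 (mod d), into u0, subtracting d again if
         that addition carries, and form <p1, p0> = u1 * B2 for folding
         u1 B^2 at the add_mssaaaa below.  */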
      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;

      /* Final q update */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }

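  /* The loop leaves a partial remainder u2 B^2 + u1 B + u0, with u2 a carry
     mask.  Cancel u2 and reduce u1 below d, counting each subtracted d in
     q1, then finish with a 2/1 division of <u1, u0>.  */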
  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#else
#error Unknown DIV_QR_1N_METHOD
#endif
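
/* A minimal usage sketch (hypothetical, not part of GMP's interface): the
   caller passes a normalized divisor together with its precomputed inverse,
   e.g.

     static mp_limb_t
     div_qr_1_normalized (mp_ptr qp, mp_srcptr np, mp_size_t n, mp_limb_t d)
     {
       mp_limb_t dinv;
       ASSERT (n > 0);
       ASSERT (d & GMP_NUMB_HIGHBIT);
       invert_limb (dinv, d);  /* dinv = floor ((B^2 - 1) / d) - B */
       return mpn_div_qr_1n_pi1 (qp, np, n, CNST_LIMB(0), d, dinv);
     }

   Handling an unnormalized divisor additionally requires shifting both the
   divisor and the dividend, which this sketch omits.  */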