/* Interpolation for the algorithm Toom-Cook 6.5-way.
2
3 Contributed to the GNU project by Marco Bodrato.
4
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9 Copyright 2009, 2010 Free Software Foundation, Inc.
10
11 This file is part of the GNU MP Library.
12
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21 License for more details.
22
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
25
26
27 #include "gmp.h"
28 #include "gmp-impl.h"
29
30
31 #if HAVE_NATIVE_mpn_sublsh_n
32 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
33 #else
34 static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)35 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
36 {
37 #if USE_MUL_1 && 0
38 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
39 #else
40 mp_limb_t __cy;
41 __cy = mpn_lshift(ws,src,n,s);
42 return __cy + mpn_sub_n(dst,dst,ws,n);
43 #endif
44 }
45 #endif
46
47 #if HAVE_NATIVE_mpn_addlsh_n
48 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
49 #else
50 static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)51 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
52 {
53 #if USE_MUL_1 && 0
54 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
55 #else
56 mp_limb_t __cy;
57 __cy = mpn_lshift(ws,src,n,s);
58 return __cy + mpn_add_n(dst,dst,ws,n);
59 #endif
60 }
61 #endif
62
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* {dst,nd} -= {src,ns} >> s: subtract the right-shifted operand from
   the (longer) destination; the low s bits of src[0] are discarded.
   ws is scratch passed through to DO_mpn_sublsh_n.
   FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif
75
76
/* The interpolation below hard-codes left shifts of up to 20 bits and
   exact divisions by 42525 and 2835; limbs narrower than these guards
   would need a reworked sequence.  */
#if GMP_NUMB_BITS < 21
#error Not implemented: Both sublsh_n(,,,20) should be corrected.
#endif

#if GMP_NUMB_BITS < 16
#error Not implemented: divexact_by42525 needs splitting.
#endif

#if GMP_NUMB_BITS < 12
#error Not implemented: Hard to adapt...
#endif

/* Tuning knobs: choose between a single addmul_1/submul_1 by a small
   constant and an equivalent sequence of plain add/sub plus shifted
   add/sub (see their use sites below).
   FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif

/* 2-adic (binary) inverse of 9 modulo 2^GMP_NUMB_BITS, used as the
   precomputed-inverse argument of mpn_pi1_bdiv_q_1 below.  */
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

/* 2-adic inverse of 255 modulo 2^GMP_NUMB_BITS.  */
#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

/* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
#endif
#endif

/* Exact division by 255: when the limb size is a multiple of 8 bits,
   255 = 2^8 - 1 and mpn_bdiv_dbm1 applies; otherwise fall back to a
   precomputed-inverse bdiv or plain divexact_1.  */
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

/* Exact division by 36 = 9 << 2.  */
#ifndef mpn_divexact_by9x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
#else
#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
#endif
#endif

/* Exact division by 42525.  */
#ifndef mpn_divexact_by42525
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
#else
#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
#endif
#endif

/* Exact division by 11340 = 2835 << 2.  */
#ifndef mpn_divexact_by2835x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
#else
#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
#endif
#endif
156
157 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
158 points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
159 we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 11 (or 10), given the 12 (resp. 11) values:
161
162 r0 = limit at infinity of f(x) / x^7,
163 r1 = f(4),f(-4),
164 r2 = f(2),f(-2),
165 r3 = f(1),f(-1),
166 r4 = f(1/4),f(-1/4),
167 r5 = f(1/2),f(-1/2),
168 r6 = f(0).
169
170 All couples of the form f(n),f(-n) must be already mixed with
171 toom_couple_handling(f(n),...,f(-n),...)
172
173 The result is stored in {pp, spt + 7*n (or 6*n)}.
174 At entry, r6 is stored at {pp, 2n},
175 r4 is stored at {pp + 3n, 3n + 1}.
176 r2 is stored at {pp + 7n, 3n + 1}.
177 r0 is stored at {pp +11n, spt}.
178
179 The other values are 3n+1 limbs each (with most significant limbs small).
180
   Negative intermediate results are stored in two's complement.
182 Inputs are destroyed.
183 */
184
/* Interpolate the evaluation values described above into the product
   coefficients, accumulating into {pp, spt + 7*n (or 6*n)}.  wsi is
   scratch space of at least 3*n+1 limbs.  */
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			    mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;		/* 3 * n */
  mp_size_t n3p1;	/* 3 * n + 1: size of each interpolation operand */
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    /* Remove the contribution of the leading coefficient r0 (the value
       at infinity, present only in the 6.5-way case) from the mixed
       finite-point evaluations.  */
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  /* Remove the contribution of the low coefficient r6 = f(0), stored
     at {pp, 2n}, from r4 and r1.  */
  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  /* Net effect: r1 <- r1 + r4, r4 <- r4 - r1 (via wsi, then swap).  */
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  /* Same treatment for the r5/r2 pair.  */
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  /* r3 -= r6.  */
  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  /* 257 = 2^8 + 1: one subtraction plus one shifted subtraction.  */
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  /* Patch the most significant limb: for a negative (two's complement)
     operand the exact division leaves the top bits wrong.  */
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  /* 60 = 2^6 - 2^2.  */
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  /* 100 = 2^6 + 2^5 + 2^2.  */
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  /* 225 = 2^8 - 2^5 + 1: subtract, add shifted, subtract shifted.  */
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|	||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|	||H r3|M r3|L r3|	||H_r5|M_r5|L_r5|
  */

  /* Add r5 (3n+1 limbs) at position n, propagating the carry upward.  */
  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  /* Add r3 at position 5n.  */
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  /* Add r1 at position 9n; the tail handling depends on whether the
     high product part (half != 0) is present.  */
  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef r0
#undef r2
#undef r4
}
351