/* Interpolation for the algorithm Toom-Cook 6.5-way.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"


#if GMP_NUMB_BITS < 21
#error Not implemented: Both sublsh_n(,,,20) calls should be corrected.
#endif

#if GMP_NUMB_BITS < 16
#error Not implemented: divexact_by42525 needs splitting.
#endif

#if GMP_NUMB_BITS < 12
#error Not implemented: Hard to adapt...
#endif


/* FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif

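/* DO_mpn_sublsh_n(dst, src, n, s, ws): {dst,n} -= {src,n} << s, returning the
   borrow out of the top limb.  The generic fallback shifts into the n-limb
   scratch area ws and then subtracts.  */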
#if HAVE_NATIVE_mpn_sublsh_n
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
#else
static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return __cy + mpn_sub_n(dst,dst,ws,n);
#endif
}
#endif

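/* DO_mpn_addlsh_n(dst, src, n, s, ws): {dst,n} += {src,n} << s, returning the
   carry out of the top limb; same scratch convention as above.  */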
#if HAVE_NATIVE_mpn_addlsh_n
#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
#else
#if !AORSMUL_FASTER_2AORSLSH || !AORSMUL_FASTER_AORS_2AORSLSH
static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return __cy + mpn_add_n(dst,dst,ws,n);
#endif
}
#endif
#endif

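/* DO_mpn_subrsh(dst, nd, src, ns, s, ws): {dst,nd} -= {src,ns} >> s, i.e.
   subtract src shifted right by s bits (0 < s < GMP_NUMB_BITS).  */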
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)                                 \
  do {                                                                    \
    mp_limb_t __cy;                                                       \
    MPN_DECR_U (dst, nd, src[0] >> s);                                    \
    __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
    MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);                         \
  } while (0)
#endif


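/* Multiplicative inverses of 9, 255, 2835 and 42525 modulo 2^GMP_NUMB_BITS,
   passed as the precomputed inverse to mpn_pi1_bdiv_q_1 for the exact
   divisions below.  */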
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

/* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
#endif
#endif

#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) \
  mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by9x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x4(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
#else
#define mpn_divexact_by9x4(dst,src,size) \
  mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
#endif
#endif

#ifndef mpn_divexact_by42525
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
#else
#define mpn_divexact_by42525(dst,src,size) \
  mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
#endif
#endif

#ifndef mpn_divexact_by2835x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x4(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
#else
#define mpn_divexact_by2835x4(dst,src,size) \
  mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
#endif
#endif

/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
   points: infinity (6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0.  More
   precisely, we want to compute f(2^(GMP_NUMB_BITS * n)) for a
   polynomial f of degree 11 (or 10), given the 12 (resp. 11) values:

     r0 = limit at infinity of f(x) / x^11,
     r1 = f(4), f(-4),
     r2 = f(2), f(-2),
     r3 = f(1), f(-1),
     r4 = f(1/4), f(-1/4),
     r5 = f(1/2), f(-1/2),
     r6 = f(0).

   All couples of the form f(n), f(-n) must already have been mixed
   with toom_couple_handling(f(n), ..., f(-n), ...).

   The result is stored in {pp, spt + 11*n (or 10*n)}.
   At entry, r6 is stored at {pp, 2n},
   r4 is stored at {pp + 3n, 3n + 1},
   r2 is stored at {pp + 7n, 3n + 1},
   r0 is stored at {pp + 11n, spt}.

   The other values are 3n+1 limbs each (with most significant limbs small).

   Negative intermediate results are stored in two's complement.
   Inputs are destroyed.
*/
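/* Equivalently, writing B = 2^GMP_NUMB_BITS and
     f(x) = f11*x^11 + ... + f1*x + f0   (f11 = 0 in the Toom-6 case),
   the routine recovers the coefficients fi from the tabulated values and
   recomposes f(B^n) = sum_i fi * B^(i*n) in place at pp.

   Sketch of the expected call, as a Toom-6.5 multiplication would issue it
   once the pointwise products are in place and every f(x),f(-x) couple has
   been combined (the names here are purely illustrative):

     mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, spt, half, wsi);

   with half != 0 when r0 is present (degree-11 case) and wsi pointing to at
   least 3*n+1 limbs of scratch space.  */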

void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			    mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define r4 (pp + n3)     /* 3n+1 */
#define r2 (pp + 7 * n)  /* 3n+1 */
#define r0 (pp + 11 * n) /* s+t <= 2*n */

  /******************************* interpolation *****************************/
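  /* When the leading coefficient r0 is present (half != 0), subtract its
     contribution, scaled by the power of 2 matching each evaluation point,
     from r1, r2, r3, r4 and r5.  */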
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  }

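  /* Subtract the contribution of r6 = f(0), stored at {pp,2n}, from the
     r1/r4 couple.  */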
  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

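  /* Butterfly: r1 <- r1 + r4, r4 <- r4 - r1 (values on entry); the
     difference may be negative.  */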
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

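  /* Likewise remove r6's contribution from the r2/r5 couple.  */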
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

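  /* Butterfly: r2 <- r2 + r5, r5 <- r5 - r2 (values on entry).  */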
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

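  /* Remove r6's contribution from r3 as well.  */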
  r3[n3] -= mpn_sub_n (r3 + n, r3 + n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows.  Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
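  /* r4 may be negative here.  The exact division by the factor 4 only
     determines the quotient modulo 2^(total bits - 2), so restore the two
     top bits of the most significant limb by sign extension.  */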
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS - 3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS - 2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r4, r2, r4, n3p1);
  r4[n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r5, r5, r1, n3p1);
  r5[n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
#endif

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
     ||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
    ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */
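  /* r5, r3 and r1 (3n+1 limbs each) are added in at offsets n, 5*n and 9*n
     respectively, with carries propagated by MPN_INCR_U.  */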

  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  pp[2 * n3] += mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10 * n] += mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef r0
#undef r2
#undef r4
}