xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/toom_interpolate_12pts.c (revision 1daf83e636cd998f45e5597a8f995a540e2d5b4a)
1 /* Interpolation for the algorithm Toom-Cook 6.5-way.
2 
3    Contributed to the GNU project by Marco Bodrato.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15 
16   * the GNU Lesser General Public License as published by the Free
17     Software Foundation; either version 3 of the License, or (at your
18     option) any later version.
19 
20 or
21 
22   * the GNU General Public License as published by the Free Software
23     Foundation; either version 2 of the License, or (at your option) any
24     later version.
25 
26 or both in parallel, as here.
27 
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31 for more details.
32 
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library.  If not,
35 see https://www.gnu.org/licenses/.  */
36 
37 
38 #include "gmp-impl.h"
39 
40 
41 #if GMP_NUMB_BITS < 21
42 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
43 #endif
44 
45 #if GMP_NUMB_BITS < 16
46 #error Not implemented: divexact_by42525 needs splitting.
47 #endif
48 
49 #if GMP_NUMB_BITS < 12
50 #error Not implemented: Hard to adapt...
51 #endif
52 
53 
54 /* FIXME: tuneup should decide the best variant */
55 #ifndef AORSMUL_FASTER_AORS_AORSLSH
56 #define AORSMUL_FASTER_AORS_AORSLSH 1
57 #endif
58 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
59 #define AORSMUL_FASTER_AORS_2AORSLSH 1
60 #endif
61 #ifndef AORSMUL_FASTER_2AORSLSH
62 #define AORSMUL_FASTER_2AORSLSH 1
63 #endif
64 #ifndef AORSMUL_FASTER_3AORSLSH
65 #define AORSMUL_FASTER_3AORSLSH 1
66 #endif
67 
68 
69 #if HAVE_NATIVE_mpn_sublsh_n
70 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
71 #else
72 static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)73 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
74 {
75 #if USE_MUL_1 && 0
76   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
77 #else
78   mp_limb_t __cy;
79   __cy = mpn_lshift(ws,src,n,s);
80   return    __cy + mpn_sub_n(dst,dst,ws,n);
81 #endif
82 }
83 #endif
84 
85 #if HAVE_NATIVE_mpn_addlsh_n
86 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
87 #else
88 #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
89 static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst,mp_srcptr src,mp_size_t n,unsigned int s,mp_ptr ws)90 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
91 {
92 #if USE_MUL_1 && 0
93   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
94 #else
95   mp_limb_t __cy;
96   __cy = mpn_lshift(ws,src,n,s);
97   return    __cy + mpn_add_n(dst,dst,ws,n);
98 #endif
99 }
100 #endif
101 #endif
102 
103 #if HAVE_NATIVE_mpn_subrsh
104 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
105 #else
106 /* FIXME: This is not a correct definition, it assumes no carry */
107 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
108 do {									\
109   mp_limb_t __cy;							\
110   MPN_DECR_U (dst, nd, src[0] >> s);					\
111   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
112   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
113 } while (0)
114 #endif
115 
116 
117 #define BINVERT_9 \
118   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
119 
120 #define BINVERT_255 \
121   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
122 
123   /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
124 #if GMP_LIMB_BITS == 32
125 #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
126 #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
127 #else
128 #if GMP_LIMB_BITS == 64
129 #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
130 #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
131 #endif
132 #endif
133 
134 #ifndef mpn_divexact_by255
135 #if GMP_NUMB_BITS % 8 == 0
136 #define mpn_divexact_by255(dst,src,size) \
137   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
138 #else
139 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
140 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
141 #else
142 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
143 #endif
144 #endif
145 #endif
146 
147 #ifndef mpn_divexact_by9x4
148 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
149 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
150 #else
151 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
152 #endif
153 #endif
154 
155 #ifndef mpn_divexact_by42525
156 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
157 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
158 #else
159 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
160 #endif
161 #endif
162 
163 #ifndef mpn_divexact_by2835x4
164 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
165 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
166 #else
167 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
168 #endif
169 #endif
170 
171 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
172    points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
173    we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
174    degree 11 (or 10), given the 12 (rsp. 11) values:
175 
176      r0 = limit at infinity of f(x) / x^11,
177      r1 = f(4),f(-4),
178      r2 = f(2),f(-2),
179      r3 = f(1),f(-1),
180      r4 = f(1/4),f(-1/4),
181      r5 = f(1/2),f(-1/2),
182      r6 = f(0).
183 
184    All couples of the form f(n),f(-n) must be already mixed with
185    toom_couple_handling(f(n),...,f(-n),...)
186 
187    The result is stored in {pp, spt + 7*n (or 6*n)}.
188    At entry, r6 is stored at {pp, 2n},
189    r4 is stored at {pp + 3n, 3n + 1}.
190    r2 is stored at {pp + 7n, 3n + 1}.
191    r0 is stored at {pp +11n, spt}.
192 
193    The other values are 3n+1 limbs each (with most significant limbs small).
194 
195    Negative intermediate results are stored two-complemented.
196    Inputs are destroyed.
197 */
198 
199 void
mpn_toom_interpolate_12pts(mp_ptr pp,mp_ptr r1,mp_ptr r3,mp_ptr r5,mp_size_t n,mp_size_t spt,int half,mp_ptr wsi)200 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
201 			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
202 {
203   mp_limb_t cy;
204   mp_size_t n3;
205   mp_size_t n3p1;
206   n3 = 3 * n;
207   n3p1 = n3 + 1;
208 
209 #define   r4    (pp + n3)			/* 3n+1 */
210 #define   r2    (pp + 7 * n)			/* 3n+1 */
211 #define   r0    (pp +11 * n)			/* s+t <= 2*n */
212 
213   /******************************* interpolation *****************************/
214   if (half != 0) {
215     cy = mpn_sub_n (r3, r3, r0, spt);
216     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
217 
218     cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
219     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
220     DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
221 
222     cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
223     MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
224     DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
225   };
226 
227   r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
228   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
229 
230 #if HAVE_NATIVE_mpn_add_n_sub_n
231   mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
232 #else
233   ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
234   mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
235   MP_PTR_SWAP(r1, wsi);
236 #endif
237 
238   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
239   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
240 
241 #if HAVE_NATIVE_mpn_add_n_sub_n
242   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
243 #else
244   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
245   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
246   MP_PTR_SWAP(r5, wsi);
247 #endif
248 
249   r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
250 
251 #if AORSMUL_FASTER_AORS_AORSLSH
252   mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
253 #else
254   mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
255   DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
256 #endif
257   /* A division by 2835x4 follows. Warning: the operand can be negative! */
258   mpn_divexact_by2835x4(r4, r4, n3p1);
259   if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
260     r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
261 
262 #if AORSMUL_FASTER_2AORSLSH
263   mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
264 #else
265   DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
266   DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
267 #endif
268   mpn_divexact_by255(r5, r5, n3p1);
269 
270   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
271 
272 #if AORSMUL_FASTER_3AORSLSH
273   ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
274 #else
275   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
276   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
277   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
278 #endif
279   ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
280   mpn_divexact_by42525(r1, r1, n3p1);
281 
282 #if AORSMUL_FASTER_AORS_2AORSLSH
283   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
284 #else
285   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
286   ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
287   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
288 #endif
289   mpn_divexact_by9x4(r2, r2, n3p1);
290 
291   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
292 
293 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
294   mpn_rsh1sub_n (r4, r2, r4, n3p1);
295   r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
296 #else
297   mpn_sub_n (r4, r2, r4, n3p1);
298   ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
299 #endif
300   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
301 
302 #ifdef HAVE_NATIVE_mpn_rsh1add_n
303   mpn_rsh1add_n (r5, r5, r1, n3p1);
304   r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
305 #else
306   mpn_add_n (r5, r5, r1, n3p1);
307   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
308 #endif
309 
310   /* last interpolation steps... */
311   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
312   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
313   /* ... could be mixed with recomposition
314 	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
315   */
316 
317   /***************************** recomposition *******************************/
318   /*
319     pp[] prior to operations:
320     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
321 
322     summation scheme for remaining operations:
323     |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
324     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
325 	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
326   */
327 
328   cy = mpn_add_n (pp + n, pp + n, r5, n);
329   cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
330 #if HAVE_NATIVE_mpn_add_nc
331   cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
332 #else
333   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
334   cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
335 #endif
336   MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
337 
338   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
339   cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
340 #if HAVE_NATIVE_mpn_add_nc
341   cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
342 #else
343   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
344   cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
345 #endif
346   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
347 
348   pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
349   if (half) {
350     cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
351 #if HAVE_NATIVE_mpn_add_nc
352     if (LIKELY (spt > n)) {
353       cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
354       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
355     } else {
356       ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
357     }
358 #else
359     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
360     if (LIKELY (spt > n)) {
361       cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
362       MPN_INCR_U (pp + 4 * n3, spt - n, cy);
363     } else {
364       ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
365     }
366 #endif
367   } else {
368     ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
369   }
370 
371 #undef   r0
372 #undef   r2
373 #undef   r4
374 }
375