1*86d7f5d3SJohn Marino /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
2*86d7f5d3SJohn Marino
3*86d7f5d3SJohn Marino Contributed to the GNU project by Marco Bodrato.
4*86d7f5d3SJohn Marino
5*86d7f5d3SJohn Marino THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6*86d7f5d3SJohn Marino SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7*86d7f5d3SJohn Marino GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8*86d7f5d3SJohn Marino
9*86d7f5d3SJohn Marino Copyright 2009, 2010 Free Software Foundation, Inc.
10*86d7f5d3SJohn Marino
11*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
12*86d7f5d3SJohn Marino
13*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
14*86d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
15*86d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
16*86d7f5d3SJohn Marino option) any later version.
17*86d7f5d3SJohn Marino
18*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
19*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20*86d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21*86d7f5d3SJohn Marino License for more details.
22*86d7f5d3SJohn Marino
23*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
24*86d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
25*86d7f5d3SJohn Marino
26*86d7f5d3SJohn Marino #include "gmp.h"
27*86d7f5d3SJohn Marino #include "gmp-impl.h"
28*86d7f5d3SJohn Marino
29*86d7f5d3SJohn Marino /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
30*86d7f5d3SJohn Marino #ifndef mpn_divexact_by3
31*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
32*86d7f5d3SJohn Marino #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
33*86d7f5d3SJohn Marino #else
34*86d7f5d3SJohn Marino #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
35*86d7f5d3SJohn Marino #endif
36*86d7f5d3SJohn Marino #endif
37*86d7f5d3SJohn Marino
38*86d7f5d3SJohn Marino /* Interpolation for Toom-3.5, using the evaluation points: infinity,
39*86d7f5d3SJohn Marino 1, -1, 2, -2. More precisely, we want to compute
40*86d7f5d3SJohn Marino f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
41*86d7f5d3SJohn Marino six values
42*86d7f5d3SJohn Marino
43*86d7f5d3SJohn Marino w5 = f(0),
44*86d7f5d3SJohn Marino w4 = f(-1),
45*86d7f5d3SJohn Marino w3 = f(1)
46*86d7f5d3SJohn Marino w2 = f(-2),
47*86d7f5d3SJohn Marino w1 = f(2),
48*86d7f5d3SJohn Marino w0 = limit at infinity of f(x) / x^5,
49*86d7f5d3SJohn Marino
50*86d7f5d3SJohn Marino The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
51*86d7f5d3SJohn Marino {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
52*86d7f5d3SJohn Marino {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
53*86d7f5d3SJohn Marino significant limbs small). f(-1) and f(-2) may be negative, signs
54*86d7f5d3SJohn Marino determined by the flag bits. All intermediate results are positive.
55*86d7f5d3SJohn Marino Inputs are destroyed.
56*86d7f5d3SJohn Marino
57*86d7f5d3SJohn Marino Interpolation sequence was taken from the paper: "Integer and
58*86d7f5d3SJohn Marino Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
59*86d7f5d3SJohn Marino Some slight variations were introduced: adaptation to "gmp
60*86d7f5d3SJohn Marino instruction set", and a final saving of an operation by interlacing
61*86d7f5d3SJohn Marino interpolation and recomposition phases.
62*86d7f5d3SJohn Marino */
63*86d7f5d3SJohn Marino
64*86d7f5d3SJohn Marino void
mpn_toom_interpolate_6pts(mp_ptr pp,mp_size_t n,enum toom6_flags flags,mp_ptr w4,mp_ptr w2,mp_ptr w1,mp_size_t w0n)65*86d7f5d3SJohn Marino mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
66*86d7f5d3SJohn Marino mp_ptr w4, mp_ptr w2, mp_ptr w1,
67*86d7f5d3SJohn Marino mp_size_t w0n)
68*86d7f5d3SJohn Marino {
69*86d7f5d3SJohn Marino mp_limb_t cy;
70*86d7f5d3SJohn Marino /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
71*86d7f5d3SJohn Marino mp_limb_t cy4, cy6, embankment;
72*86d7f5d3SJohn Marino
73*86d7f5d3SJohn Marino ASSERT( n > 0 );
74*86d7f5d3SJohn Marino ASSERT( 2*n >= w0n && w0n > 0 );
75*86d7f5d3SJohn Marino
76*86d7f5d3SJohn Marino #define w5 pp /* 2n */
77*86d7f5d3SJohn Marino #define w3 (pp + 2 * n) /* 2n+1 */
78*86d7f5d3SJohn Marino #define w0 (pp + 5 * n) /* w0n */
79*86d7f5d3SJohn Marino
80*86d7f5d3SJohn Marino /* Interpolate with sequence:
81*86d7f5d3SJohn Marino W2 =(W1 - W2)>>2
82*86d7f5d3SJohn Marino W1 =(W1 - W5)>>1
83*86d7f5d3SJohn Marino W1 =(W1 - W2)>>1
84*86d7f5d3SJohn Marino W4 =(W3 - W4)>>1
85*86d7f5d3SJohn Marino W2 =(W2 - W4)/3
86*86d7f5d3SJohn Marino W3 = W3 - W4 - W5
87*86d7f5d3SJohn Marino W1 =(W1 - W3)/3
88*86d7f5d3SJohn Marino // Last steps are mixed with recomposition...
89*86d7f5d3SJohn Marino W2 = W2 - W0<<2
90*86d7f5d3SJohn Marino W4 = W4 - W2
91*86d7f5d3SJohn Marino W3 = W3 - W1
92*86d7f5d3SJohn Marino W2 = W2 - W0
93*86d7f5d3SJohn Marino */
94*86d7f5d3SJohn Marino
95*86d7f5d3SJohn Marino /* W2 =(W1 - W2)>>2 */
96*86d7f5d3SJohn Marino if (flags & toom6_vm2_neg)
97*86d7f5d3SJohn Marino mpn_add_n (w2, w1, w2, 2 * n + 1);
98*86d7f5d3SJohn Marino else
99*86d7f5d3SJohn Marino mpn_sub_n (w2, w1, w2, 2 * n + 1);
100*86d7f5d3SJohn Marino mpn_rshift (w2, w2, 2 * n + 1, 2);
101*86d7f5d3SJohn Marino
102*86d7f5d3SJohn Marino /* W1 =(W1 - W5)>>1 */
103*86d7f5d3SJohn Marino w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
104*86d7f5d3SJohn Marino mpn_rshift (w1, w1, 2 * n + 1, 1);
105*86d7f5d3SJohn Marino
106*86d7f5d3SJohn Marino /* W1 =(W1 - W2)>>1 */
107*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1sub_n
108*86d7f5d3SJohn Marino mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
109*86d7f5d3SJohn Marino #else
110*86d7f5d3SJohn Marino mpn_sub_n (w1, w1, w2, 2 * n + 1);
111*86d7f5d3SJohn Marino mpn_rshift (w1, w1, 2 * n + 1, 1);
112*86d7f5d3SJohn Marino #endif
113*86d7f5d3SJohn Marino
114*86d7f5d3SJohn Marino /* W4 =(W3 - W4)>>1 */
115*86d7f5d3SJohn Marino if (flags & toom6_vm1_neg)
116*86d7f5d3SJohn Marino {
117*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1add_n
118*86d7f5d3SJohn Marino mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
119*86d7f5d3SJohn Marino #else
120*86d7f5d3SJohn Marino mpn_add_n (w4, w3, w4, 2 * n + 1);
121*86d7f5d3SJohn Marino mpn_rshift (w4, w4, 2 * n + 1, 1);
122*86d7f5d3SJohn Marino #endif
123*86d7f5d3SJohn Marino }
124*86d7f5d3SJohn Marino else
125*86d7f5d3SJohn Marino {
126*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1sub_n
127*86d7f5d3SJohn Marino mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
128*86d7f5d3SJohn Marino #else
129*86d7f5d3SJohn Marino mpn_sub_n (w4, w3, w4, 2 * n + 1);
130*86d7f5d3SJohn Marino mpn_rshift (w4, w4, 2 * n + 1, 1);
131*86d7f5d3SJohn Marino #endif
132*86d7f5d3SJohn Marino }
133*86d7f5d3SJohn Marino
134*86d7f5d3SJohn Marino /* W2 =(W2 - W4)/3 */
135*86d7f5d3SJohn Marino mpn_sub_n (w2, w2, w4, 2 * n + 1);
136*86d7f5d3SJohn Marino mpn_divexact_by3 (w2, w2, 2 * n + 1);
137*86d7f5d3SJohn Marino
138*86d7f5d3SJohn Marino /* W3 = W3 - W4 - W5 */
139*86d7f5d3SJohn Marino mpn_sub_n (w3, w3, w4, 2 * n + 1);
140*86d7f5d3SJohn Marino w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
141*86d7f5d3SJohn Marino
142*86d7f5d3SJohn Marino /* W1 =(W1 - W3)/3 */
143*86d7f5d3SJohn Marino mpn_sub_n (w1, w1, w3, 2 * n + 1);
144*86d7f5d3SJohn Marino mpn_divexact_by3 (w1, w1, 2 * n + 1);
145*86d7f5d3SJohn Marino
146*86d7f5d3SJohn Marino /*
147*86d7f5d3SJohn Marino [1 0 0 0 0 0;
148*86d7f5d3SJohn Marino 0 1 0 0 0 0;
149*86d7f5d3SJohn Marino 1 0 1 0 0 0;
150*86d7f5d3SJohn Marino 0 1 0 1 0 0;
151*86d7f5d3SJohn Marino 1 0 1 0 1 0;
152*86d7f5d3SJohn Marino 0 0 0 0 0 1]
153*86d7f5d3SJohn Marino
154*86d7f5d3SJohn Marino pp[] prior to operations:
155*86d7f5d3SJohn Marino |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
156*86d7f5d3SJohn Marino
157*86d7f5d3SJohn Marino summation scheme for remaining operations:
158*86d7f5d3SJohn Marino |______________5|n_____4|n_____3|n_____2|n______|n______|pp
159*86d7f5d3SJohn Marino |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
160*86d7f5d3SJohn Marino || H w4 | L w4 |
161*86d7f5d3SJohn Marino || H w2 | L w2 |
162*86d7f5d3SJohn Marino || H w1 | L w1 |
163*86d7f5d3SJohn Marino ||-H w1 |-L w1 |
164*86d7f5d3SJohn Marino |-H w0 |-L w0 ||-H w2 |-L w2 |
165*86d7f5d3SJohn Marino */
166*86d7f5d3SJohn Marino cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
167*86d7f5d3SJohn Marino MPN_INCR_U (pp + 3 * n + 1, n, cy);
168*86d7f5d3SJohn Marino
169*86d7f5d3SJohn Marino /* W2 -= W0<<2 */
170*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n
171*86d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sublsh2_n
172*86d7f5d3SJohn Marino cy = mpn_sublsh2_n(w2, w2, w0, w0n);
173*86d7f5d3SJohn Marino #else
174*86d7f5d3SJohn Marino cy = mpn_sublsh_n(w2, w2, w0, w0n, 2);
175*86d7f5d3SJohn Marino #endif
176*86d7f5d3SJohn Marino #else
177*86d7f5d3SJohn Marino /* {W4,2*n+1} is now free and can be overwritten. */
178*86d7f5d3SJohn Marino cy = mpn_lshift(w4, w0, w0n, 2);
179*86d7f5d3SJohn Marino cy+= mpn_sub_n(w2, w2, w4, w0n);
180*86d7f5d3SJohn Marino #endif
181*86d7f5d3SJohn Marino MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
182*86d7f5d3SJohn Marino
183*86d7f5d3SJohn Marino /* W4L = W4L - W2L */
184*86d7f5d3SJohn Marino cy = mpn_sub_n (pp + n, pp + n, w2, n);
185*86d7f5d3SJohn Marino MPN_DECR_U (w3, 2 * n + 1, cy);
186*86d7f5d3SJohn Marino
187*86d7f5d3SJohn Marino /* W3H = W3H + W2L */
188*86d7f5d3SJohn Marino cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
189*86d7f5d3SJohn Marino /* W1L + W2H */
190*86d7f5d3SJohn Marino cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
191*86d7f5d3SJohn Marino MPN_INCR_U (w1 + n, n + 1, cy);
192*86d7f5d3SJohn Marino
193*86d7f5d3SJohn Marino /* W0 = W0 + W1H */
194*86d7f5d3SJohn Marino if (LIKELY (w0n > n))
195*86d7f5d3SJohn Marino cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
196*86d7f5d3SJohn Marino else
197*86d7f5d3SJohn Marino cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
198*86d7f5d3SJohn Marino
199*86d7f5d3SJohn Marino /*
200*86d7f5d3SJohn Marino summation scheme for the next operation:
201*86d7f5d3SJohn Marino |...____5|n_____4|n_____3|n_____2|n______|n______|pp
202*86d7f5d3SJohn Marino |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
203*86d7f5d3SJohn Marino ...-w0___|-w1_w2 |
204*86d7f5d3SJohn Marino */
205*86d7f5d3SJohn Marino /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
206*86d7f5d3SJohn Marino cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
207*86d7f5d3SJohn Marino
208*86d7f5d3SJohn Marino /* embankment is a "dirty trick" to avoid carry/borrow propagation
209*86d7f5d3SJohn Marino beyond allocated memory */
210*86d7f5d3SJohn Marino embankment = w0[w0n - 1] - 1;
211*86d7f5d3SJohn Marino w0[w0n - 1] = 1;
212*86d7f5d3SJohn Marino if (LIKELY (w0n > n)) {
213*86d7f5d3SJohn Marino if ( LIKELY(cy4 > cy6) )
214*86d7f5d3SJohn Marino MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
215*86d7f5d3SJohn Marino else
216*86d7f5d3SJohn Marino MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
217*86d7f5d3SJohn Marino MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
218*86d7f5d3SJohn Marino MPN_INCR_U (w0 + n, w0n - n, cy6);
219*86d7f5d3SJohn Marino } else {
220*86d7f5d3SJohn Marino MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
221*86d7f5d3SJohn Marino MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
222*86d7f5d3SJohn Marino }
223*86d7f5d3SJohn Marino w0[w0n - 1] += embankment;
224*86d7f5d3SJohn Marino
225*86d7f5d3SJohn Marino #undef w5
226*86d7f5d3SJohn Marino #undef w3
227*86d7f5d3SJohn Marino #undef w0
228*86d7f5d3SJohn Marino
229*86d7f5d3SJohn Marino }
230