1*86d7f5d3SJohn Marino /* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42.
2*86d7f5d3SJohn Marino
3*86d7f5d3SJohn Marino Contributed to the GNU project by Robert Harley.
4*86d7f5d3SJohn Marino Improvements by Paul Zimmermann and Marco Bodrato.
5*86d7f5d3SJohn Marino
6*86d7f5d3SJohn Marino THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7*86d7f5d3SJohn Marino SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8*86d7f5d3SJohn Marino GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9*86d7f5d3SJohn Marino
10*86d7f5d3SJohn Marino Copyright 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2009 Free Software
11*86d7f5d3SJohn Marino Foundation, Inc.
12*86d7f5d3SJohn Marino
13*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
14*86d7f5d3SJohn Marino
15*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify it
16*86d7f5d3SJohn Marino under the terms of the GNU Lesser General Public License as published by the
17*86d7f5d3SJohn Marino Free Software Foundation; either version 3 of the License, or (at your
18*86d7f5d3SJohn Marino option) any later version.
19*86d7f5d3SJohn Marino
20*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
21*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22*86d7f5d3SJohn Marino FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
23*86d7f5d3SJohn Marino for more details.
24*86d7f5d3SJohn Marino
25*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
26*86d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
27*86d7f5d3SJohn Marino
28*86d7f5d3SJohn Marino #include "gmp.h"
29*86d7f5d3SJohn Marino #include "gmp-impl.h"
30*86d7f5d3SJohn Marino
31*86d7f5d3SJohn Marino void
mpn_toom_interpolate_5pts(mp_ptr c,mp_ptr v2,mp_ptr vm1,mp_size_t k,mp_size_t twor,int sa,mp_limb_t vinf0)32*86d7f5d3SJohn Marino mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
33*86d7f5d3SJohn Marino mp_size_t k, mp_size_t twor, int sa,
34*86d7f5d3SJohn Marino mp_limb_t vinf0)
35*86d7f5d3SJohn Marino {
36*86d7f5d3SJohn Marino mp_limb_t cy, saved;
37*86d7f5d3SJohn Marino mp_size_t twok;
38*86d7f5d3SJohn Marino mp_size_t kk1;
39*86d7f5d3SJohn Marino mp_ptr c1, v1, c3, vinf;
40*86d7f5d3SJohn Marino
41*86d7f5d3SJohn Marino twok = k + k;
42*86d7f5d3SJohn Marino kk1 = twok + 1;
43*86d7f5d3SJohn Marino
44*86d7f5d3SJohn Marino c1 = c + k;
45*86d7f5d3SJohn Marino v1 = c1 + k;
46*86d7f5d3SJohn Marino c3 = v1 + k;
47*86d7f5d3SJohn Marino vinf = c3 + k;
48*86d7f5d3SJohn Marino
49*86d7f5d3SJohn Marino #define v0 (c)
50*86d7f5d3SJohn Marino /* (1) v2 <- v2-vm1 < v2+|vm1|, (16 8 4 2 1) - (1 -1 1 -1 1) =
51*86d7f5d3SJohn Marino thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k) (15 9 3 3 0)
52*86d7f5d3SJohn Marino */
53*86d7f5d3SJohn Marino if (sa)
54*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
55*86d7f5d3SJohn Marino else
56*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
57*86d7f5d3SJohn Marino
58*86d7f5d3SJohn Marino /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
59*86d7f5d3SJohn Marino v0 v1 hi(vinf) |vm1| v2-vm1 EMPTY */
60*86d7f5d3SJohn Marino
61*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1)); /* v2 <- v2 / 3 */
62*86d7f5d3SJohn Marino /* (5 3 1 1 0)*/
63*86d7f5d3SJohn Marino
64*86d7f5d3SJohn Marino /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
65*86d7f5d3SJohn Marino v0 v1 hi(vinf) |vm1| (v2-vm1)/3 EMPTY */
66*86d7f5d3SJohn Marino
67*86d7f5d3SJohn Marino /* (2) vm1 <- tm1 := (v1 - vm1) / 2 [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
68*86d7f5d3SJohn Marino tm1 >= 0 (0 1 0 1 0)
69*86d7f5d3SJohn Marino No carry comes out from {v1, kk1} +/- {vm1, kk1},
70*86d7f5d3SJohn Marino and the division by two is exact.
71*86d7f5d3SJohn Marino If (sa!=0) the sign of vm1 is negative */
72*86d7f5d3SJohn Marino if (sa)
73*86d7f5d3SJohn Marino {
74*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_rsh1add_n
75*86d7f5d3SJohn Marino mpn_rsh1add_n (vm1, v1, vm1, kk1);
76*86d7f5d3SJohn Marino #else
77*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
78*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
79*86d7f5d3SJohn Marino #endif
80*86d7f5d3SJohn Marino }
81*86d7f5d3SJohn Marino else
82*86d7f5d3SJohn Marino {
83*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_rsh1sub_n
84*86d7f5d3SJohn Marino mpn_rsh1sub_n (vm1, v1, vm1, kk1);
85*86d7f5d3SJohn Marino #else
86*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
87*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
88*86d7f5d3SJohn Marino #endif
89*86d7f5d3SJohn Marino }
90*86d7f5d3SJohn Marino
91*86d7f5d3SJohn Marino /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
92*86d7f5d3SJohn Marino v0 v1 hi(vinf) tm1 (v2-vm1)/3 EMPTY */
93*86d7f5d3SJohn Marino
94*86d7f5d3SJohn Marino /* (3) v1 <- t1 := v1 - v0 (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
95*86d7f5d3SJohn Marino t1 >= 0
96*86d7f5d3SJohn Marino */
97*86d7f5d3SJohn Marino vinf[0] -= mpn_sub_n (v1, v1, c, twok);
98*86d7f5d3SJohn Marino
99*86d7f5d3SJohn Marino /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
100*86d7f5d3SJohn Marino v0 v1-v0 hi(vinf) tm1 (v2-vm1)/3 EMPTY */
101*86d7f5d3SJohn Marino
102*86d7f5d3SJohn Marino /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
103*86d7f5d3SJohn Marino t2 >= 0 [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
104*86d7f5d3SJohn Marino */
105*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_rsh1sub_n
106*86d7f5d3SJohn Marino mpn_rsh1sub_n (v2, v2, v1, kk1);
107*86d7f5d3SJohn Marino #else
108*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
109*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
110*86d7f5d3SJohn Marino #endif
111*86d7f5d3SJohn Marino
112*86d7f5d3SJohn Marino /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
113*86d7f5d3SJohn Marino v0 v1-v0 hi(vinf) tm1 (v2-vm1-3t1)/6 EMPTY */
114*86d7f5d3SJohn Marino
115*86d7f5d3SJohn Marino /* (5) v1 <- t1-tm1 (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
116*86d7f5d3SJohn Marino result is v1 >= 0
117*86d7f5d3SJohn Marino */
118*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
119*86d7f5d3SJohn Marino
120*86d7f5d3SJohn Marino /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
121*86d7f5d3SJohn Marino cy = mpn_add_n (c1, c1, vm1, kk1);
122*86d7f5d3SJohn Marino MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
123*86d7f5d3SJohn Marino /* Memory allocated for vm1 is now free, it can be recycled ...*/
124*86d7f5d3SJohn Marino
125*86d7f5d3SJohn Marino /* (6) v2 <- v2 - 2*vinf, (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
126*86d7f5d3SJohn Marino result is v2 >= 0 */
127*86d7f5d3SJohn Marino saved = vinf[0]; /* Remember v1's highest byte (will be overwritten). */
128*86d7f5d3SJohn Marino vinf[0] = vinf0; /* Set the right value for vinf0 */
129*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_sublsh1_n
130*86d7f5d3SJohn Marino cy = mpn_sublsh1_n (v2, v2, vinf, twor);
131*86d7f5d3SJohn Marino #else
132*86d7f5d3SJohn Marino /* Overwrite unused vm1 */
133*86d7f5d3SJohn Marino cy = mpn_lshift (vm1, vinf, twor, 1);
134*86d7f5d3SJohn Marino cy += mpn_sub_n (v2, v2, vm1, twor);
135*86d7f5d3SJohn Marino #endif
136*86d7f5d3SJohn Marino MPN_DECR_U (v2 + twor, kk1 - twor, cy);
137*86d7f5d3SJohn Marino
138*86d7f5d3SJohn Marino /* Current matrix is
139*86d7f5d3SJohn Marino [1 0 0 0 0; vinf
140*86d7f5d3SJohn Marino 0 1 0 0 0; v2
141*86d7f5d3SJohn Marino 1 0 1 0 0; v1
142*86d7f5d3SJohn Marino 0 1 0 1 0; vm1
143*86d7f5d3SJohn Marino 0 0 0 0 1] v0
144*86d7f5d3SJohn Marino Some vaues already are in-place (we added vm1 in the correct position)
145*86d7f5d3SJohn Marino | vinf| v1 | v0 |
146*86d7f5d3SJohn Marino | vm1 |
147*86d7f5d3SJohn Marino One still is in a separated area
148*86d7f5d3SJohn Marino | +v2 |
149*86d7f5d3SJohn Marino We have to compute v1-=vinf; vm1 -= v2,
150*86d7f5d3SJohn Marino |-vinf|
151*86d7f5d3SJohn Marino | -v2 |
152*86d7f5d3SJohn Marino Carefully reordering operations we can avoid to compute twice the sum
153*86d7f5d3SJohn Marino of the high half of v2 plus the low half of vinf.
154*86d7f5d3SJohn Marino */
155*86d7f5d3SJohn Marino
156*86d7f5d3SJohn Marino /* Add the high half of t2 in {vinf} */
157*86d7f5d3SJohn Marino if ( LIKELY(twor > k + 1) ) { /* This is the expected flow */
158*86d7f5d3SJohn Marino cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
159*86d7f5d3SJohn Marino MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
160*86d7f5d3SJohn Marino } else { /* triggered only by very unbalanced cases like
161*86d7f5d3SJohn Marino (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
162*86d7f5d3SJohn Marino ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
163*86d7f5d3SJohn Marino }
164*86d7f5d3SJohn Marino /* (7) v1 <- v1 - vinf, (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
165*86d7f5d3SJohn Marino result is >= 0 */
166*86d7f5d3SJohn Marino /* Side effect: we also subtracted (high half) vm1 -= v2 */
167*86d7f5d3SJohn Marino cy = mpn_sub_n (v1, v1, vinf, twor); /* vinf is at most twor long. */
168*86d7f5d3SJohn Marino vinf0 = vinf[0]; /* Save again the right value for vinf0 */
169*86d7f5d3SJohn Marino vinf[0] = saved;
170*86d7f5d3SJohn Marino MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last bytes. */
171*86d7f5d3SJohn Marino
172*86d7f5d3SJohn Marino /* (8) vm1 <- vm1-v2 (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
173*86d7f5d3SJohn Marino Operate only on the low half.
174*86d7f5d3SJohn Marino */
175*86d7f5d3SJohn Marino cy = mpn_sub_n (c1, c1, v2, k);
176*86d7f5d3SJohn Marino MPN_DECR_U (v1, kk1, cy);
177*86d7f5d3SJohn Marino
178*86d7f5d3SJohn Marino /********************* Beginning the final phase **********************/
179*86d7f5d3SJohn Marino
180*86d7f5d3SJohn Marino /* Most of the recomposition was done */
181*86d7f5d3SJohn Marino
182*86d7f5d3SJohn Marino /* add t2 in {c+3k, ...}, but only the low half */
183*86d7f5d3SJohn Marino cy = mpn_add_n (c3, c3, v2, k);
184*86d7f5d3SJohn Marino vinf[0] += cy;
185*86d7f5d3SJohn Marino ASSERT(vinf[0] >= cy); /* No carry */
186*86d7f5d3SJohn Marino MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
187*86d7f5d3SJohn Marino
188*86d7f5d3SJohn Marino #undef v0
189*86d7f5d3SJohn Marino }
190