xref: /dflybsd-src/contrib/gmp/mpn/generic/toom_interpolate_5pts.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
1*86d7f5d3SJohn Marino /* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42.
2*86d7f5d3SJohn Marino 
3*86d7f5d3SJohn Marino    Contributed to the GNU project by Robert Harley.
4*86d7f5d3SJohn Marino    Improvements by Paul Zimmermann and Marco Bodrato.
5*86d7f5d3SJohn Marino 
6*86d7f5d3SJohn Marino    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7*86d7f5d3SJohn Marino    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8*86d7f5d3SJohn Marino    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9*86d7f5d3SJohn Marino 
10*86d7f5d3SJohn Marino Copyright 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2009 Free Software
11*86d7f5d3SJohn Marino Foundation, Inc.
12*86d7f5d3SJohn Marino 
13*86d7f5d3SJohn Marino This file is part of the GNU MP Library.
14*86d7f5d3SJohn Marino 
15*86d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify it
16*86d7f5d3SJohn Marino under the terms of the GNU Lesser General Public License as published by the
17*86d7f5d3SJohn Marino Free Software Foundation; either version 3 of the License, or (at your
18*86d7f5d3SJohn Marino option) any later version.
19*86d7f5d3SJohn Marino 
20*86d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
21*86d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22*86d7f5d3SJohn Marino FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
23*86d7f5d3SJohn Marino for more details.
24*86d7f5d3SJohn Marino 
25*86d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
26*86d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27*86d7f5d3SJohn Marino 
28*86d7f5d3SJohn Marino #include "gmp.h"
29*86d7f5d3SJohn Marino #include "gmp-impl.h"
30*86d7f5d3SJohn Marino 
31*86d7f5d3SJohn Marino void
mpn_toom_interpolate_5pts(mp_ptr c,mp_ptr v2,mp_ptr vm1,mp_size_t k,mp_size_t twor,int sa,mp_limb_t vinf0)32*86d7f5d3SJohn Marino mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
33*86d7f5d3SJohn Marino 			   mp_size_t k, mp_size_t twor, int sa,
34*86d7f5d3SJohn Marino 			   mp_limb_t vinf0)
35*86d7f5d3SJohn Marino {
36*86d7f5d3SJohn Marino   mp_limb_t cy, saved;
37*86d7f5d3SJohn Marino   mp_size_t twok;
38*86d7f5d3SJohn Marino   mp_size_t kk1;
39*86d7f5d3SJohn Marino   mp_ptr c1, v1, c3, vinf;
40*86d7f5d3SJohn Marino 
41*86d7f5d3SJohn Marino   twok = k + k;
42*86d7f5d3SJohn Marino   kk1 = twok + 1;
43*86d7f5d3SJohn Marino 
44*86d7f5d3SJohn Marino   c1 = c  + k;
45*86d7f5d3SJohn Marino   v1 = c1 + k;
46*86d7f5d3SJohn Marino   c3 = v1 + k;
47*86d7f5d3SJohn Marino   vinf = c3 + k;
48*86d7f5d3SJohn Marino 
49*86d7f5d3SJohn Marino #define v0 (c)
50*86d7f5d3SJohn Marino   /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1  1) =
51*86d7f5d3SJohn Marino      thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3  3  0)
52*86d7f5d3SJohn Marino   */
53*86d7f5d3SJohn Marino   if (sa)
54*86d7f5d3SJohn Marino     ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
55*86d7f5d3SJohn Marino   else
56*86d7f5d3SJohn Marino     ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
57*86d7f5d3SJohn Marino 
58*86d7f5d3SJohn Marino   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
59*86d7f5d3SJohn Marino        v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */
60*86d7f5d3SJohn Marino 
61*86d7f5d3SJohn Marino   ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
62*86d7f5d3SJohn Marino 						      /* (5 3 1 1 0)*/
63*86d7f5d3SJohn Marino 
64*86d7f5d3SJohn Marino   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
65*86d7f5d3SJohn Marino        v0       v1      hi(vinf)       |vm1|     (v2-vm1)/3    EMPTY */
66*86d7f5d3SJohn Marino 
67*86d7f5d3SJohn Marino   /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
68*86d7f5d3SJohn Marino      tm1 >= 0                                         (0  1 0  1 0)
69*86d7f5d3SJohn Marino      No carry comes out from {v1, kk1} +/- {vm1, kk1},
70*86d7f5d3SJohn Marino      and the division by two is exact.
71*86d7f5d3SJohn Marino      If (sa!=0) the sign of vm1 is negative */
72*86d7f5d3SJohn Marino   if (sa)
73*86d7f5d3SJohn Marino     {
74*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_rsh1add_n
75*86d7f5d3SJohn Marino       mpn_rsh1add_n (vm1, v1, vm1, kk1);
76*86d7f5d3SJohn Marino #else
77*86d7f5d3SJohn Marino       ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
78*86d7f5d3SJohn Marino       ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
79*86d7f5d3SJohn Marino #endif
80*86d7f5d3SJohn Marino     }
81*86d7f5d3SJohn Marino   else
82*86d7f5d3SJohn Marino     {
83*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_rsh1sub_n
84*86d7f5d3SJohn Marino       mpn_rsh1sub_n (vm1, v1, vm1, kk1);
85*86d7f5d3SJohn Marino #else
86*86d7f5d3SJohn Marino       ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
87*86d7f5d3SJohn Marino       ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
88*86d7f5d3SJohn Marino #endif
89*86d7f5d3SJohn Marino     }
90*86d7f5d3SJohn Marino 
91*86d7f5d3SJohn Marino   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
92*86d7f5d3SJohn Marino        v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
93*86d7f5d3SJohn Marino 
94*86d7f5d3SJohn Marino   /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
95*86d7f5d3SJohn Marino      t1 >= 0
96*86d7f5d3SJohn Marino   */
97*86d7f5d3SJohn Marino   vinf[0] -= mpn_sub_n (v1, v1, c, twok);
98*86d7f5d3SJohn Marino 
99*86d7f5d3SJohn Marino   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
100*86d7f5d3SJohn Marino        v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
101*86d7f5d3SJohn Marino 
102*86d7f5d3SJohn Marino   /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
103*86d7f5d3SJohn Marino      t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
104*86d7f5d3SJohn Marino   */
105*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_rsh1sub_n
106*86d7f5d3SJohn Marino   mpn_rsh1sub_n (v2, v2, v1, kk1);
107*86d7f5d3SJohn Marino #else
108*86d7f5d3SJohn Marino   ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
109*86d7f5d3SJohn Marino   ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
110*86d7f5d3SJohn Marino #endif
111*86d7f5d3SJohn Marino 
112*86d7f5d3SJohn Marino   /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
113*86d7f5d3SJohn Marino        v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */
114*86d7f5d3SJohn Marino 
115*86d7f5d3SJohn Marino   /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
116*86d7f5d3SJohn Marino      result is v1 >= 0
117*86d7f5d3SJohn Marino   */
118*86d7f5d3SJohn Marino   ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
119*86d7f5d3SJohn Marino 
120*86d7f5d3SJohn Marino   /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
121*86d7f5d3SJohn Marino   cy = mpn_add_n (c1, c1, vm1, kk1);
122*86d7f5d3SJohn Marino   MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
123*86d7f5d3SJohn Marino   /* Memory allocated for vm1 is now free, it can be recycled ...*/
124*86d7f5d3SJohn Marino 
125*86d7f5d3SJohn Marino   /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
126*86d7f5d3SJohn Marino      result is v2 >= 0 */
127*86d7f5d3SJohn Marino   saved = vinf[0];       /* Remember v1's highest byte (will be overwritten). */
128*86d7f5d3SJohn Marino   vinf[0] = vinf0;       /* Set the right value for vinf0                     */
129*86d7f5d3SJohn Marino #ifdef HAVE_NATIVE_mpn_sublsh1_n
130*86d7f5d3SJohn Marino   cy = mpn_sublsh1_n (v2, v2, vinf, twor);
131*86d7f5d3SJohn Marino #else
132*86d7f5d3SJohn Marino   /* Overwrite unused vm1 */
133*86d7f5d3SJohn Marino   cy = mpn_lshift (vm1, vinf, twor, 1);
134*86d7f5d3SJohn Marino   cy += mpn_sub_n (v2, v2, vm1, twor);
135*86d7f5d3SJohn Marino #endif
136*86d7f5d3SJohn Marino   MPN_DECR_U (v2 + twor, kk1 - twor, cy);
137*86d7f5d3SJohn Marino 
138*86d7f5d3SJohn Marino   /* Current matrix is
139*86d7f5d3SJohn Marino      [1 0 0 0 0; vinf
140*86d7f5d3SJohn Marino       0 1 0 0 0; v2
141*86d7f5d3SJohn Marino       1 0 1 0 0; v1
142*86d7f5d3SJohn Marino       0 1 0 1 0; vm1
143*86d7f5d3SJohn Marino       0 0 0 0 1] v0
144*86d7f5d3SJohn Marino      Some vaues already are in-place (we added vm1 in the correct position)
145*86d7f5d3SJohn Marino      | vinf|  v1 |  v0 |
146*86d7f5d3SJohn Marino 	      | vm1 |
147*86d7f5d3SJohn Marino      One still is in a separated area
148*86d7f5d3SJohn Marino 	| +v2 |
149*86d7f5d3SJohn Marino      We have to compute v1-=vinf; vm1 -= v2,
150*86d7f5d3SJohn Marino 	   |-vinf|
151*86d7f5d3SJohn Marino 	      | -v2 |
152*86d7f5d3SJohn Marino      Carefully reordering operations we can avoid to compute twice the sum
153*86d7f5d3SJohn Marino      of the high half of v2 plus the low half of vinf.
154*86d7f5d3SJohn Marino   */
155*86d7f5d3SJohn Marino 
156*86d7f5d3SJohn Marino   /* Add the high half of t2 in {vinf} */
157*86d7f5d3SJohn Marino   if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
158*86d7f5d3SJohn Marino     cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
159*86d7f5d3SJohn Marino     MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
160*86d7f5d3SJohn Marino   } else { /* triggered only by very unbalanced cases like
161*86d7f5d3SJohn Marino 	      (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
162*86d7f5d3SJohn Marino     ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
163*86d7f5d3SJohn Marino   }
164*86d7f5d3SJohn Marino   /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
165*86d7f5d3SJohn Marino      result is >= 0 */
166*86d7f5d3SJohn Marino   /* Side effect: we also subtracted (high half) vm1 -= v2 */
167*86d7f5d3SJohn Marino   cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
168*86d7f5d3SJohn Marino   vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
169*86d7f5d3SJohn Marino   vinf[0] = saved;
170*86d7f5d3SJohn Marino   MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last bytes.       */
171*86d7f5d3SJohn Marino 
172*86d7f5d3SJohn Marino   /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
173*86d7f5d3SJohn Marino      Operate only on the low half.
174*86d7f5d3SJohn Marino   */
175*86d7f5d3SJohn Marino   cy = mpn_sub_n (c1, c1, v2, k);
176*86d7f5d3SJohn Marino   MPN_DECR_U (v1, kk1, cy);
177*86d7f5d3SJohn Marino 
178*86d7f5d3SJohn Marino   /********************* Beginning the final phase **********************/
179*86d7f5d3SJohn Marino 
180*86d7f5d3SJohn Marino   /* Most of the recomposition was done */
181*86d7f5d3SJohn Marino 
182*86d7f5d3SJohn Marino   /* add t2 in {c+3k, ...}, but only the low half */
183*86d7f5d3SJohn Marino   cy = mpn_add_n (c3, c3, v2, k);
184*86d7f5d3SJohn Marino   vinf[0] += cy;
185*86d7f5d3SJohn Marino   ASSERT(vinf[0] >= cy); /* No carry */
186*86d7f5d3SJohn Marino   MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
187*86d7f5d3SJohn Marino 
188*86d7f5d3SJohn Marino #undef v0
189*86d7f5d3SJohn Marino }
190