1 /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
2
3 Contributed to the GNU project by Marco Bodrato.
4
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9 Copyright 2009, 2010 Free Software Foundation, Inc.
10
11 This file is part of the GNU MP Library.
12
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21 License for more details.
22
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
25
26 #include "gmp.h"
27 #include "gmp-impl.h"
28
29 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
30 #ifndef mpn_divexact_by3
31 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
32 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
33 #else
34 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
35 #endif
36 #endif
37
38 /* Interpolation for Toom-3.5, using the evaluation points: infinity,
39 1, -1, 2, -2. More precisely, we want to compute
40 f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
41 six values
42
43 w5 = f(0),
44 w4 = f(-1),
45 w3 = f(1)
46 w2 = f(-2),
47 w1 = f(2),
48 w0 = limit at infinity of f(x) / x^5,
49
50 The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
51 {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
52 {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
53 significant limbs small). f(-1) and f(-2) may be negative, signs
54 determined by the flag bits. All intermediate results are positive.
55 Inputs are destroyed.
56
57 Interpolation sequence was taken from the paper: "Integer and
58 Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
59 Some slight variations were introduced: adaptation to "gmp
60 instruction set", and a final saving of an operation by interlacing
61 interpolation and recomposition phases.
62 */
63
64 void
mpn_toom_interpolate_6pts(mp_ptr pp,mp_size_t n,enum toom6_flags flags,mp_ptr w4,mp_ptr w2,mp_ptr w1,mp_size_t w0n)65 mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
66 mp_ptr w4, mp_ptr w2, mp_ptr w1,
67 mp_size_t w0n)
68 {
69 mp_limb_t cy;
70 /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
71 mp_limb_t cy4, cy6, embankment;
72
73 ASSERT( n > 0 );
74 ASSERT( 2*n >= w0n && w0n > 0 );
75
76 #define w5 pp /* 2n */
77 #define w3 (pp + 2 * n) /* 2n+1 */
78 #define w0 (pp + 5 * n) /* w0n */
79
80 /* Interpolate with sequence:
81 W2 =(W1 - W2)>>2
82 W1 =(W1 - W5)>>1
83 W1 =(W1 - W2)>>1
84 W4 =(W3 - W4)>>1
85 W2 =(W2 - W4)/3
86 W3 = W3 - W4 - W5
87 W1 =(W1 - W3)/3
88 // Last steps are mixed with recomposition...
89 W2 = W2 - W0<<2
90 W4 = W4 - W2
91 W3 = W3 - W1
92 W2 = W2 - W0
93 */
94
95 /* W2 =(W1 - W2)>>2 */
96 if (flags & toom6_vm2_neg)
97 mpn_add_n (w2, w1, w2, 2 * n + 1);
98 else
99 mpn_sub_n (w2, w1, w2, 2 * n + 1);
100 mpn_rshift (w2, w2, 2 * n + 1, 2);
101
102 /* W1 =(W1 - W5)>>1 */
103 w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
104 mpn_rshift (w1, w1, 2 * n + 1, 1);
105
106 /* W1 =(W1 - W2)>>1 */
107 #if HAVE_NATIVE_mpn_rsh1sub_n
108 mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
109 #else
110 mpn_sub_n (w1, w1, w2, 2 * n + 1);
111 mpn_rshift (w1, w1, 2 * n + 1, 1);
112 #endif
113
114 /* W4 =(W3 - W4)>>1 */
115 if (flags & toom6_vm1_neg)
116 {
117 #if HAVE_NATIVE_mpn_rsh1add_n
118 mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
119 #else
120 mpn_add_n (w4, w3, w4, 2 * n + 1);
121 mpn_rshift (w4, w4, 2 * n + 1, 1);
122 #endif
123 }
124 else
125 {
126 #if HAVE_NATIVE_mpn_rsh1sub_n
127 mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
128 #else
129 mpn_sub_n (w4, w3, w4, 2 * n + 1);
130 mpn_rshift (w4, w4, 2 * n + 1, 1);
131 #endif
132 }
133
134 /* W2 =(W2 - W4)/3 */
135 mpn_sub_n (w2, w2, w4, 2 * n + 1);
136 mpn_divexact_by3 (w2, w2, 2 * n + 1);
137
138 /* W3 = W3 - W4 - W5 */
139 mpn_sub_n (w3, w3, w4, 2 * n + 1);
140 w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
141
142 /* W1 =(W1 - W3)/3 */
143 mpn_sub_n (w1, w1, w3, 2 * n + 1);
144 mpn_divexact_by3 (w1, w1, 2 * n + 1);
145
146 /*
147 [1 0 0 0 0 0;
148 0 1 0 0 0 0;
149 1 0 1 0 0 0;
150 0 1 0 1 0 0;
151 1 0 1 0 1 0;
152 0 0 0 0 0 1]
153
154 pp[] prior to operations:
155 |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
156
157 summation scheme for remaining operations:
158 |______________5|n_____4|n_____3|n_____2|n______|n______|pp
159 |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
160 || H w4 | L w4 |
161 || H w2 | L w2 |
162 || H w1 | L w1 |
163 ||-H w1 |-L w1 |
164 |-H w0 |-L w0 ||-H w2 |-L w2 |
165 */
166 cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
167 MPN_INCR_U (pp + 3 * n + 1, n, cy);
168
169 /* W2 -= W0<<2 */
170 #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n
171 #if HAVE_NATIVE_mpn_sublsh2_n
172 cy = mpn_sublsh2_n(w2, w2, w0, w0n);
173 #else
174 cy = mpn_sublsh_n(w2, w2, w0, w0n, 2);
175 #endif
176 #else
177 /* {W4,2*n+1} is now free and can be overwritten. */
178 cy = mpn_lshift(w4, w0, w0n, 2);
179 cy+= mpn_sub_n(w2, w2, w4, w0n);
180 #endif
181 MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
182
183 /* W4L = W4L - W2L */
184 cy = mpn_sub_n (pp + n, pp + n, w2, n);
185 MPN_DECR_U (w3, 2 * n + 1, cy);
186
187 /* W3H = W3H + W2L */
188 cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
189 /* W1L + W2H */
190 cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
191 MPN_INCR_U (w1 + n, n + 1, cy);
192
193 /* W0 = W0 + W1H */
194 if (LIKELY (w0n > n))
195 cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
196 else
197 cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
198
199 /*
200 summation scheme for the next operation:
201 |...____5|n_____4|n_____3|n_____2|n______|n______|pp
202 |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
203 ...-w0___|-w1_w2 |
204 */
205 /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
206 cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
207
208 /* embankment is a "dirty trick" to avoid carry/borrow propagation
209 beyond allocated memory */
210 embankment = w0[w0n - 1] - 1;
211 w0[w0n - 1] = 1;
212 if (LIKELY (w0n > n)) {
213 if ( LIKELY(cy4 > cy6) )
214 MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
215 else
216 MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
217 MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
218 MPN_INCR_U (w0 + n, w0n - n, cy6);
219 } else {
220 MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
221 MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
222 }
223 w0[w0n - 1] += embankment;
224
225 #undef w5
226 #undef w3
227 #undef w0
228
229 }
230