1 /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52 2 3 Contributed to the GNU project by Marco Bodrato. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20 or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26 or both in parallel, as here. 27 28 The GNU MP Library is distributed in the hope that it will be useful, but 29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 for more details. 32 33 You should have received copies of the GNU General Public License and the 34 GNU Lesser General Public License along with the GNU MP Library. If not, 35 see https://www.gnu.org/licenses/. */ 36 37 #include "gmp-impl.h" 38 39 #define BINVERT_3 MODLIMB_INVERSE_3 40 41 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */ 42 #ifndef mpn_divexact_by3 43 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 44 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) 45 #else 46 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) 47 #endif 48 #endif 49 50 /* Interpolation for Toom-3.5, using the evaluation points: infinity, 51 1, -1, 2, -2. More precisely, we want to compute 52 f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the 53 six values 54 55 w5 = f(0), 56 w4 = f(-1), 57 w3 = f(1) 58 w2 = f(-2), 59 w1 = f(2), 60 w0 = limit at infinity of f(x) / x^5, 61 62 The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at 63 {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at 64 {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most 65 significant limbs small). f(-1) and f(-2) may be negative, signs 66 determined by the flag bits. All intermediate results are positive. 67 Inputs are destroyed. 68 69 Interpolation sequence was taken from the paper: "Integer and 70 Polynomial Multiplication: Towards Optimal Toom-Cook Matrices". 71 Some slight variations were introduced: adaptation to "gmp 72 instruction set", and a final saving of an operation by interlacing 73 interpolation and recomposition phases. 74 */ 75 76 void 77 mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags, 78 mp_ptr w4, mp_ptr w2, mp_ptr w1, 79 mp_size_t w0n) 80 { 81 mp_limb_t cy; 82 /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */ 83 mp_limb_t cy4, cy6, embankment; 84 85 ASSERT( n > 0 ); 86 ASSERT( 2*n >= w0n && w0n > 0 ); 87 88 #define w5 pp /* 2n */ 89 #define w3 (pp + 2 * n) /* 2n+1 */ 90 #define w0 (pp + 5 * n) /* w0n */ 91 92 /* Interpolate with sequence: 93 W2 =(W1 - W2)>>2 94 W1 =(W1 - W5)>>1 95 W1 =(W1 - W2)>>1 96 W4 =(W3 - W4)>>1 97 W2 =(W2 - W4)/3 98 W3 = W3 - W4 - W5 99 W1 =(W1 - W3)/3 100 // Last steps are mixed with recomposition... 101 W2 = W2 - W0<<2 102 W4 = W4 - W2 103 W3 = W3 - W1 104 W2 = W2 - W0 105 */ 106 107 /* W2 =(W1 - W2)>>2 */ 108 if (flags & toom6_vm2_neg) 109 mpn_add_n (w2, w1, w2, 2 * n + 1); 110 else 111 mpn_sub_n (w2, w1, w2, 2 * n + 1); 112 mpn_rshift (w2, w2, 2 * n + 1, 2); 113 114 /* W1 =(W1 - W5)>>1 */ 115 w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n); 116 mpn_rshift (w1, w1, 2 * n + 1, 1); 117 118 /* W1 =(W1 - W2)>>1 */ 119 #if HAVE_NATIVE_mpn_rsh1sub_n 120 mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1); 121 #else 122 mpn_sub_n (w1, w1, w2, 2 * n + 1); 123 mpn_rshift (w1, w1, 2 * n + 1, 1); 124 #endif 125 126 /* W4 =(W3 - W4)>>1 */ 127 if (flags & toom6_vm1_neg) 128 { 129 #if HAVE_NATIVE_mpn_rsh1add_n 130 mpn_rsh1add_n (w4, w3, w4, 2 * n + 1); 131 #else 132 mpn_add_n (w4, w3, w4, 2 * n + 1); 133 mpn_rshift (w4, w4, 2 * n + 1, 1); 134 #endif 135 } 136 else 137 { 138 #if HAVE_NATIVE_mpn_rsh1sub_n 139 mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1); 140 #else 141 mpn_sub_n (w4, w3, w4, 2 * n + 1); 142 mpn_rshift (w4, w4, 2 * n + 1, 1); 143 #endif 144 } 145 146 /* W2 =(W2 - W4)/3 */ 147 mpn_sub_n (w2, w2, w4, 2 * n + 1); 148 mpn_divexact_by3 (w2, w2, 2 * n + 1); 149 150 /* W3 = W3 - W4 - W5 */ 151 mpn_sub_n (w3, w3, w4, 2 * n + 1); 152 w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n); 153 154 /* W1 =(W1 - W3)/3 */ 155 mpn_sub_n (w1, w1, w3, 2 * n + 1); 156 mpn_divexact_by3 (w1, w1, 2 * n + 1); 157 158 /* 159 [1 0 0 0 0 0; 160 0 1 0 0 0 0; 161 1 0 1 0 0 0; 162 0 1 0 1 0 0; 163 1 0 1 0 1 0; 164 0 0 0 0 0 1] 165 166 pp[] prior to operations: 167 |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| 168 169 summation scheme for remaining operations: 170 |______________5|n_____4|n_____3|n_____2|n______|n______|pp 171 |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| 172 || H w4 | L w4 | 173 || H w2 | L w2 | 174 || H w1 | L w1 | 175 ||-H w1 |-L w1 | 176 |-H w0 |-L w0 ||-H w2 |-L w2 | 177 */ 178 cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1); 179 MPN_INCR_U (pp + 3 * n + 1, n, cy); 180 181 /* W2 -= W0<<2 */ 182 #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1 183 #if HAVE_NATIVE_mpn_sublsh2_n_ip1 184 cy = mpn_sublsh2_n_ip1 (w2, w0, w0n); 185 #else 186 cy = mpn_sublsh_n (w2, w2, w0, w0n, 2); 187 #endif 188 #else 189 /* {W4,2*n+1} is now free and can be overwritten. */ 190 cy = mpn_lshift(w4, w0, w0n, 2); 191 cy+= mpn_sub_n(w2, w2, w4, w0n); 192 #endif 193 MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy); 194 195 /* W4L = W4L - W2L */ 196 cy = mpn_sub_n (pp + n, pp + n, w2, n); 197 MPN_DECR_U (w3, 2 * n + 1, cy); 198 199 /* W3H = W3H + W2L */ 200 cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n); 201 /* W1L + W2H */ 202 cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n); 203 MPN_INCR_U (w1 + n, n + 1, cy); 204 205 /* W0 = W0 + W1H */ 206 if (LIKELY (w0n > n)) 207 cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n); 208 else 209 cy6 = mpn_add_n (w0, w0, w1 + n, w0n); 210 211 /* 212 summation scheme for the next operation: 213 |...____5|n_____4|n_____3|n_____2|n______|n______|pp 214 |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__| 215 ...-w0___|-w1_w2 | 216 */ 217 /* if(LIKELY(w0n>n)) the two operands below DO overlap! */ 218 cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n); 219 220 /* embankment is a "dirty trick" to avoid carry/borrow propagation 221 beyond allocated memory */ 222 embankment = w0[w0n - 1] - 1; 223 w0[w0n - 1] = 1; 224 if (LIKELY (w0n > n)) { 225 if (cy4 > cy6) 226 MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6); 227 else 228 MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4); 229 MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy); 230 MPN_INCR_U (w0 + n, w0n - n, cy6); 231 } else { 232 MPN_INCR_U (pp + 4 * n, w0n + n, cy4); 233 MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6); 234 } 235 w0[w0n - 1] += embankment; 236 237 #undef w5 238 #undef w3 239 #undef w0 240 241 } 242