186d7f5d3SJohn Marino /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino Contributed to the GNU project by Marco Bodrato.
486d7f5d3SJohn Marino
586d7f5d3SJohn Marino THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
686d7f5d3SJohn Marino SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
786d7f5d3SJohn Marino GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino Copyright 2009, 2010 Free Software Foundation, Inc.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino This file is part of the GNU MP Library.
1286d7f5d3SJohn Marino
1386d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1486d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1586d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1686d7f5d3SJohn Marino option) any later version.
1786d7f5d3SJohn Marino
1886d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1986d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2086d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2186d7f5d3SJohn Marino License for more details.
2286d7f5d3SJohn Marino
2386d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2486d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2586d7f5d3SJohn Marino
2686d7f5d3SJohn Marino #include "gmp.h"
2786d7f5d3SJohn Marino #include "gmp-impl.h"
2886d7f5d3SJohn Marino
2986d7f5d3SJohn Marino /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
3086d7f5d3SJohn Marino #ifndef mpn_divexact_by3
3186d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
3286d7f5d3SJohn Marino #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
3386d7f5d3SJohn Marino #else
3486d7f5d3SJohn Marino #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
3586d7f5d3SJohn Marino #endif
3686d7f5d3SJohn Marino #endif
3786d7f5d3SJohn Marino
3886d7f5d3SJohn Marino /* Interpolation for Toom-3.5, using the evaluation points: infinity,
3986d7f5d3SJohn Marino 1, -1, 2, -2. More precisely, we want to compute
4086d7f5d3SJohn Marino f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
4186d7f5d3SJohn Marino six values
4286d7f5d3SJohn Marino
4386d7f5d3SJohn Marino w5 = f(0),
4486d7f5d3SJohn Marino w4 = f(-1),
4586d7f5d3SJohn Marino w3 = f(1)
4686d7f5d3SJohn Marino w2 = f(-2),
4786d7f5d3SJohn Marino w1 = f(2),
4886d7f5d3SJohn Marino w0 = limit at infinity of f(x) / x^5,
4986d7f5d3SJohn Marino
5086d7f5d3SJohn Marino The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
5186d7f5d3SJohn Marino {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
5286d7f5d3SJohn Marino {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
5386d7f5d3SJohn Marino significant limbs small). f(-1) and f(-2) may be negative, signs
5486d7f5d3SJohn Marino determined by the flag bits. All intermediate results are positive.
5586d7f5d3SJohn Marino Inputs are destroyed.
5686d7f5d3SJohn Marino
5786d7f5d3SJohn Marino Interpolation sequence was taken from the paper: "Integer and
5886d7f5d3SJohn Marino Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
5986d7f5d3SJohn Marino Some slight variations were introduced: adaptation to "gmp
6086d7f5d3SJohn Marino instruction set", and a final saving of an operation by interlacing
6186d7f5d3SJohn Marino interpolation and recomposition phases.
6286d7f5d3SJohn Marino */
6386d7f5d3SJohn Marino
6486d7f5d3SJohn Marino void
mpn_toom_interpolate_6pts(mp_ptr pp,mp_size_t n,enum toom6_flags flags,mp_ptr w4,mp_ptr w2,mp_ptr w1,mp_size_t w0n)6586d7f5d3SJohn Marino mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
6686d7f5d3SJohn Marino mp_ptr w4, mp_ptr w2, mp_ptr w1,
6786d7f5d3SJohn Marino mp_size_t w0n)
6886d7f5d3SJohn Marino {
6986d7f5d3SJohn Marino mp_limb_t cy;
7086d7f5d3SJohn Marino /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
7186d7f5d3SJohn Marino mp_limb_t cy4, cy6, embankment;
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino ASSERT( n > 0 );
7486d7f5d3SJohn Marino ASSERT( 2*n >= w0n && w0n > 0 );
7586d7f5d3SJohn Marino
7686d7f5d3SJohn Marino #define w5 pp /* 2n */
7786d7f5d3SJohn Marino #define w3 (pp + 2 * n) /* 2n+1 */
7886d7f5d3SJohn Marino #define w0 (pp + 5 * n) /* w0n */
7986d7f5d3SJohn Marino
8086d7f5d3SJohn Marino /* Interpolate with sequence:
8186d7f5d3SJohn Marino W2 =(W1 - W2)>>2
8286d7f5d3SJohn Marino W1 =(W1 - W5)>>1
8386d7f5d3SJohn Marino W1 =(W1 - W2)>>1
8486d7f5d3SJohn Marino W4 =(W3 - W4)>>1
8586d7f5d3SJohn Marino W2 =(W2 - W4)/3
8686d7f5d3SJohn Marino W3 = W3 - W4 - W5
8786d7f5d3SJohn Marino W1 =(W1 - W3)/3
8886d7f5d3SJohn Marino // Last steps are mixed with recomposition...
8986d7f5d3SJohn Marino W2 = W2 - W0<<2
9086d7f5d3SJohn Marino W4 = W4 - W2
9186d7f5d3SJohn Marino W3 = W3 - W1
9286d7f5d3SJohn Marino W2 = W2 - W0
9386d7f5d3SJohn Marino */
9486d7f5d3SJohn Marino
9586d7f5d3SJohn Marino /* W2 =(W1 - W2)>>2 */
9686d7f5d3SJohn Marino if (flags & toom6_vm2_neg)
9786d7f5d3SJohn Marino mpn_add_n (w2, w1, w2, 2 * n + 1);
9886d7f5d3SJohn Marino else
9986d7f5d3SJohn Marino mpn_sub_n (w2, w1, w2, 2 * n + 1);
10086d7f5d3SJohn Marino mpn_rshift (w2, w2, 2 * n + 1, 2);
10186d7f5d3SJohn Marino
10286d7f5d3SJohn Marino /* W1 =(W1 - W5)>>1 */
10386d7f5d3SJohn Marino w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
10486d7f5d3SJohn Marino mpn_rshift (w1, w1, 2 * n + 1, 1);
10586d7f5d3SJohn Marino
10686d7f5d3SJohn Marino /* W1 =(W1 - W2)>>1 */
10786d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1sub_n
10886d7f5d3SJohn Marino mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
10986d7f5d3SJohn Marino #else
11086d7f5d3SJohn Marino mpn_sub_n (w1, w1, w2, 2 * n + 1);
11186d7f5d3SJohn Marino mpn_rshift (w1, w1, 2 * n + 1, 1);
11286d7f5d3SJohn Marino #endif
11386d7f5d3SJohn Marino
11486d7f5d3SJohn Marino /* W4 =(W3 - W4)>>1 */
11586d7f5d3SJohn Marino if (flags & toom6_vm1_neg)
11686d7f5d3SJohn Marino {
11786d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1add_n
11886d7f5d3SJohn Marino mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
11986d7f5d3SJohn Marino #else
12086d7f5d3SJohn Marino mpn_add_n (w4, w3, w4, 2 * n + 1);
12186d7f5d3SJohn Marino mpn_rshift (w4, w4, 2 * n + 1, 1);
12286d7f5d3SJohn Marino #endif
12386d7f5d3SJohn Marino }
12486d7f5d3SJohn Marino else
12586d7f5d3SJohn Marino {
12686d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1sub_n
12786d7f5d3SJohn Marino mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
12886d7f5d3SJohn Marino #else
12986d7f5d3SJohn Marino mpn_sub_n (w4, w3, w4, 2 * n + 1);
13086d7f5d3SJohn Marino mpn_rshift (w4, w4, 2 * n + 1, 1);
13186d7f5d3SJohn Marino #endif
13286d7f5d3SJohn Marino }
13386d7f5d3SJohn Marino
13486d7f5d3SJohn Marino /* W2 =(W2 - W4)/3 */
13586d7f5d3SJohn Marino mpn_sub_n (w2, w2, w4, 2 * n + 1);
13686d7f5d3SJohn Marino mpn_divexact_by3 (w2, w2, 2 * n + 1);
13786d7f5d3SJohn Marino
13886d7f5d3SJohn Marino /* W3 = W3 - W4 - W5 */
13986d7f5d3SJohn Marino mpn_sub_n (w3, w3, w4, 2 * n + 1);
14086d7f5d3SJohn Marino w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
14186d7f5d3SJohn Marino
14286d7f5d3SJohn Marino /* W1 =(W1 - W3)/3 */
14386d7f5d3SJohn Marino mpn_sub_n (w1, w1, w3, 2 * n + 1);
14486d7f5d3SJohn Marino mpn_divexact_by3 (w1, w1, 2 * n + 1);
14586d7f5d3SJohn Marino
14686d7f5d3SJohn Marino /*
14786d7f5d3SJohn Marino [1 0 0 0 0 0;
14886d7f5d3SJohn Marino 0 1 0 0 0 0;
14986d7f5d3SJohn Marino 1 0 1 0 0 0;
15086d7f5d3SJohn Marino 0 1 0 1 0 0;
15186d7f5d3SJohn Marino 1 0 1 0 1 0;
15286d7f5d3SJohn Marino 0 0 0 0 0 1]
15386d7f5d3SJohn Marino
15486d7f5d3SJohn Marino pp[] prior to operations:
15586d7f5d3SJohn Marino |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
15686d7f5d3SJohn Marino
15786d7f5d3SJohn Marino summation scheme for remaining operations:
15886d7f5d3SJohn Marino |______________5|n_____4|n_____3|n_____2|n______|n______|pp
15986d7f5d3SJohn Marino |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
16086d7f5d3SJohn Marino || H w4 | L w4 |
16186d7f5d3SJohn Marino || H w2 | L w2 |
16286d7f5d3SJohn Marino || H w1 | L w1 |
16386d7f5d3SJohn Marino ||-H w1 |-L w1 |
16486d7f5d3SJohn Marino |-H w0 |-L w0 ||-H w2 |-L w2 |
16586d7f5d3SJohn Marino */
16686d7f5d3SJohn Marino cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
16786d7f5d3SJohn Marino MPN_INCR_U (pp + 3 * n + 1, n, cy);
16886d7f5d3SJohn Marino
16986d7f5d3SJohn Marino /* W2 -= W0<<2 */
17086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n
17186d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_sublsh2_n
17286d7f5d3SJohn Marino cy = mpn_sublsh2_n(w2, w2, w0, w0n);
17386d7f5d3SJohn Marino #else
17486d7f5d3SJohn Marino cy = mpn_sublsh_n(w2, w2, w0, w0n, 2);
17586d7f5d3SJohn Marino #endif
17686d7f5d3SJohn Marino #else
17786d7f5d3SJohn Marino /* {W4,2*n+1} is now free and can be overwritten. */
17886d7f5d3SJohn Marino cy = mpn_lshift(w4, w0, w0n, 2);
17986d7f5d3SJohn Marino cy+= mpn_sub_n(w2, w2, w4, w0n);
18086d7f5d3SJohn Marino #endif
18186d7f5d3SJohn Marino MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
18286d7f5d3SJohn Marino
18386d7f5d3SJohn Marino /* W4L = W4L - W2L */
18486d7f5d3SJohn Marino cy = mpn_sub_n (pp + n, pp + n, w2, n);
18586d7f5d3SJohn Marino MPN_DECR_U (w3, 2 * n + 1, cy);
18686d7f5d3SJohn Marino
18786d7f5d3SJohn Marino /* W3H = W3H + W2L */
18886d7f5d3SJohn Marino cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
18986d7f5d3SJohn Marino /* W1L + W2H */
19086d7f5d3SJohn Marino cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
19186d7f5d3SJohn Marino MPN_INCR_U (w1 + n, n + 1, cy);
19286d7f5d3SJohn Marino
19386d7f5d3SJohn Marino /* W0 = W0 + W1H */
19486d7f5d3SJohn Marino if (LIKELY (w0n > n))
19586d7f5d3SJohn Marino cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
19686d7f5d3SJohn Marino else
19786d7f5d3SJohn Marino cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
19886d7f5d3SJohn Marino
19986d7f5d3SJohn Marino /*
20086d7f5d3SJohn Marino summation scheme for the next operation:
20186d7f5d3SJohn Marino |...____5|n_____4|n_____3|n_____2|n______|n______|pp
20286d7f5d3SJohn Marino |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
20386d7f5d3SJohn Marino ...-w0___|-w1_w2 |
20486d7f5d3SJohn Marino */
20586d7f5d3SJohn Marino /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
20686d7f5d3SJohn Marino cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
20786d7f5d3SJohn Marino
20886d7f5d3SJohn Marino /* embankment is a "dirty trick" to avoid carry/borrow propagation
20986d7f5d3SJohn Marino beyond allocated memory */
21086d7f5d3SJohn Marino embankment = w0[w0n - 1] - 1;
21186d7f5d3SJohn Marino w0[w0n - 1] = 1;
21286d7f5d3SJohn Marino if (LIKELY (w0n > n)) {
21386d7f5d3SJohn Marino if ( LIKELY(cy4 > cy6) )
21486d7f5d3SJohn Marino MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
21586d7f5d3SJohn Marino else
21686d7f5d3SJohn Marino MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
21786d7f5d3SJohn Marino MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
21886d7f5d3SJohn Marino MPN_INCR_U (w0 + n, w0n - n, cy6);
21986d7f5d3SJohn Marino } else {
22086d7f5d3SJohn Marino MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
22186d7f5d3SJohn Marino MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
22286d7f5d3SJohn Marino }
22386d7f5d3SJohn Marino w0[w0n - 1] += embankment;
22486d7f5d3SJohn Marino
22586d7f5d3SJohn Marino #undef w5
22686d7f5d3SJohn Marino #undef w3
22786d7f5d3SJohn Marino #undef w0
22886d7f5d3SJohn Marino
22986d7f5d3SJohn Marino }
230