/* Source: dflybsd-src/contrib/gmp/mpn/generic/sqr_basecase.c
   (revision 86d7f5d305c6adaa56ff4582ece9859d73106103).  */
/* mpn_sqr_basecase -- Internal routine to square a natural number
   of length n.

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.


Copyright 1991, 1992, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2003, 2004,
2005, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* MPN_SQR_DIAGONAL (rp, up, n) -- square each of the n limbs at up
   individually.

   The generic version below stores the double-limb square of up[i] as
   rp[2*i] (low limb) and rp[2*i+1] (high limb), for 0 <= i < n, so it writes
   2*n limbs at rp.  One factor is shifted left by GMP_NAIL_BITS before the
   multiply and the low product word is shifted right by the same amount
   before being stored.

   When HAVE_NATIVE_mpn_sqr_diagonal is set the work is delegated to
   mpn_sqr_diagonal, which is not defined in this file.  */
#if HAVE_NATIVE_mpn_sqr_diagonal
#define MPN_SQR_DIAGONAL(rp, up, n)                                     \
  mpn_sqr_diagonal (rp, up, n)
#else
#define MPN_SQR_DIAGONAL(rp, up, n)                                     \
  do {                                                                  \
    mp_size_t _i;                                                       \
    for (_i = 0; _i < (n); _i++)                                        \
      {                                                                 \
        mp_limb_t ul, lpl;                                              \
        ul = (up)[_i];                                                  \
        umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS);     \
        (rp)[2 * _i] = lpl >> GMP_NAIL_BITS;                            \
      }                                                                 \
  } while (0)
#endif


/* Start with no implementation selected.  Each candidate definition of
   mpn_sqr_basecase below is compiled only while this symbol is undefined,
   and defines it once it has been emitted.  */
#undef READY_WITH_mpn_sqr_basecase


#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2s

/* mpn_sqr_basecase using mpn_addmul_2s.

   Store the square of the n-limb number at up in the 2*n limbs at rp.
   Requires 1 <= n <= SQR_TOOM2_THRESHOLD, and rp must not overlap up.

   The cross products are accumulated in a stack temporary, the per-limb
   squares are written to rp by MPN_SQR_DIAGONAL, and finally twice the
   temporary is added in at rp + 1.

   NOTE(review): mpn_addmul_2s is not defined in this file; presumably it is a
   variant of mpn_addmul_2 that leaves out the u[k]*u[k] products, which
   would explain why no diagonal correction is done here -- confirm against
   the native implementation.  */
void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;
  mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
  mp_ptr tp = tarr;
  mp_limb_t cy;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  /* must fit 2*n limbs in tarr */
  ASSERT (n <= SQR_TOOM2_THRESHOLD);

  if ((n & 1) != 0)
    {
      if (n == 1)
        {
          /* Single limb: one widening multiply gives the whole result.  */
          mp_limb_t ul, lpl;
          ul = up[0];
          umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
          rp[0] = lpl >> GMP_NAIL_BITS;
          return;
        }

      MPN_ZERO (tp, n);

      /* Two multiplier limbs up[i], up[i+1] per pass, applied to the limbs
         above up[i]; the returned limb is stored at tp[n + i].  */
      for (i = 0; i <= n - 2; i += 2)
        {
          cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
          tp[n + i] = cy;
        }
    }
  else
    {
      if (n == 2)
        {
          /* Two limbs: up * up via one mpn_addmul_2 onto zeroed low limbs;
             the call is expected to write rp[2], and its return value
             becomes rp[3].  */
          rp[0] = 0;
          rp[1] = 0;
          rp[3] = mpn_addmul_2 (rp, up, 2, up);
          return;
        }

      MPN_ZERO (tp, n);

      for (i = 0; i <= n - 4; i += 2)
        {
          cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
          tp[n + i] = cy;
        }
      /* Remaining cross product of the top two limbs, up[n-2] * up[n-1].  */
      cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]);
      tp[2 * n - 3] = cy;
    }

  MPN_SQR_DIAGONAL (rp, up, n);

  /* rp[1 .. 2n-2] += 2 * tp[0 .. 2n-3]; the carry out goes into the top
     limb.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
  cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
  cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
  rp[2 * n - 1] += cy;
}
#define READY_WITH_mpn_sqr_basecase
#endif


#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2

/* mpn_sqr_basecase using plain mpn_addmul_2.

   Store the square of the n-limb number at up in the 2*n limbs at rp.
   Requires 1 <= n <= SQR_TOOM2_THRESHOLD, and rp must not overlap up.

   This is tricky, since we have to let mpn_addmul_2 make some undesirable
   multiplies, u[k]*u[k], that we would like to let mpn_sqr_diagonal handle.
   This forces us to conditionally add or subtract the mpn_sqr_diagonal
   results.  Examples of the product we form:

   n = 4              n = 5              n = 6
   u1u0 * u3u2u1      u1u0 * u4u3u2u1    u1u0 * u5u4u3u2u1
   u2 * u3            u3u2 * u4u3        u3u2 * u5u4u3
                                         u4 * u5
   add: u0 u2 u3      add: u0 u2 u4      add: u0 u2 u4 u5
   sub: u1            sub: u1 u3         sub: u1 u3
*/

void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;
  mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
  mp_ptr tp = tarr;
  mp_limb_t cy;

  ASSERT (n >= 1);
  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));

  /* must fit 2*n limbs in tarr */
  ASSERT (n <= SQR_TOOM2_THRESHOLD);

  if ((n & 1) != 0)
    {
      mp_limb_t x0, x1;

      if (n == 1)
        {
          /* Single limb: one widening multiply gives the whole result.  */
          mp_limb_t ul, lpl;
          ul = up[0];
          umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
          rp[0] = lpl >> GMP_NAIL_BITS;
          return;
        }

      /* The code below doesn't like unnormalized operands.  Since such
         operands are unusual, handle them with a dumb recursion.  */
      if (up[n - 1] == 0)
        {
          rp[2 * n - 2] = 0;
          rp[2 * n - 1] = 0;
          mpn_sqr_basecase (rp, up, n - 1);
          return;
        }

      MPN_ZERO (tp, n);

      /* Two multiplier limbs up[i], up[i+1] per pass, applied to the limbs
         above up[i]; the returned limb is stored at tp[n + i].  */
      for (i = 0; i <= n - 2; i += 2)
        {
          cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
          tp[n + i] = cy;
        }

      MPN_SQR_DIAGONAL (rp, up, n);

      /* Visit the two-limb squares at rp[2], rp[6], rp[10], ... (those of
         the odd-indexed limbs u1, u3, ...; the "sub" rows above).  Each is
         replaced by its two-limb negation, and 1 is subtracted from the
         following two limbs when the negated value was nonzero.  A borrow
         out of those two limbs is passed to mpn_incr_u at rp + i + 4.  */
      for (i = 2;; i += 4)
        {
          x0 = rp[i + 0];
          rp[i + 0] = (-x0) & GMP_NUMB_MASK;
          x1 = rp[i + 1];
          rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK;
          __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0);
          if (i + 4 >= 2 * n)
            break;
          mpn_incr_u (rp + i + 4, cy);
        }
    }
  else
    {
      mp_limb_t x0, x1;

      if (n == 2)
        {
          /* Two limbs: up * up via one mpn_addmul_2 onto zeroed low limbs;
             the call is expected to write rp[2], and its return value
             becomes rp[3].  */
          rp[0] = 0;
          rp[1] = 0;
          rp[3] = mpn_addmul_2 (rp, up, 2, up);
          return;
        }

      /* The code below doesn't like unnormalized operands.  Since such
         operands are unusual, handle them with a dumb recursion.  */
      if (up[n - 1] == 0)
        {
          rp[2 * n - 2] = 0;
          rp[2 * n - 1] = 0;
          mpn_sqr_basecase (rp, up, n - 1);
          return;
        }

      MPN_ZERO (tp, n);

      for (i = 0; i <= n - 4; i += 2)
        {
          cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
          tp[n + i] = cy;
        }
      /* Remaining cross product of the top two limbs, up[n-2] * up[n-1],
         done with mpn_addmul_1.  */
      cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]);
      tp[2 * n - 3] = cy;

      MPN_SQR_DIAGONAL (rp, up, n);

      /* Same negate-and-borrow pass as in the odd case, except that the loop
         is left right after negating the last visited square; the pending
         borrow for that square is then applied by the mpn_decr_u after the
         loop.  */
      for (i = 2;; i += 4)
        {
          x0 = rp[i + 0];
          rp[i + 0] = (-x0) & GMP_NUMB_MASK;
          x1 = rp[i + 1];
          rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK;
          if (i + 6 >= 2 * n)
            break;
          __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0);
          mpn_incr_u (rp + i + 4, cy);
        }
      mpn_decr_u (rp + i + 2, (x1 | x0) != 0);
    }

  /* rp[1 .. 2n-2] += 2 * tp[0 .. 2n-3]; the carry out goes into the top
     limb.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
  cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
  cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
  rp[2 * n - 1] += cy;
}
#define READY_WITH_mpn_sqr_basecase
#endif


25186d7f5d3SJohn Marino #if ! defined (READY_WITH_mpn_sqr_basecase)
25286d7f5d3SJohn Marino 
25386d7f5d3SJohn Marino /* Default mpn_sqr_basecase using mpn_addmul_1.  */
25486d7f5d3SJohn Marino 
25586d7f5d3SJohn Marino void
mpn_sqr_basecase(mp_ptr rp,mp_srcptr up,mp_size_t n)25686d7f5d3SJohn Marino mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
25786d7f5d3SJohn Marino {
25886d7f5d3SJohn Marino   mp_size_t i;
25986d7f5d3SJohn Marino 
26086d7f5d3SJohn Marino   ASSERT (n >= 1);
26186d7f5d3SJohn Marino   ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));
26286d7f5d3SJohn Marino 
26386d7f5d3SJohn Marino   {
26486d7f5d3SJohn Marino     mp_limb_t ul, lpl;
26586d7f5d3SJohn Marino     ul = up[0];
26686d7f5d3SJohn Marino     umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
26786d7f5d3SJohn Marino     rp[0] = lpl >> GMP_NAIL_BITS;
26886d7f5d3SJohn Marino   }
26986d7f5d3SJohn Marino   if (n > 1)
27086d7f5d3SJohn Marino     {
27186d7f5d3SJohn Marino       mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
27286d7f5d3SJohn Marino       mp_ptr tp = tarr;
27386d7f5d3SJohn Marino       mp_limb_t cy;
27486d7f5d3SJohn Marino 
27586d7f5d3SJohn Marino       /* must fit 2*n limbs in tarr */
27686d7f5d3SJohn Marino       ASSERT (n <= SQR_TOOM2_THRESHOLD);
27786d7f5d3SJohn Marino 
27886d7f5d3SJohn Marino       cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
27986d7f5d3SJohn Marino       tp[n - 1] = cy;
28086d7f5d3SJohn Marino       for (i = 2; i < n; i++)
28186d7f5d3SJohn Marino 	{
28286d7f5d3SJohn Marino 	  mp_limb_t cy;
28386d7f5d3SJohn Marino 	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
28486d7f5d3SJohn Marino 	  tp[n + i - 2] = cy;
28586d7f5d3SJohn Marino 	}
28686d7f5d3SJohn Marino       MPN_SQR_DIAGONAL (rp + 2, up + 1, n - 1);
28786d7f5d3SJohn Marino 
28886d7f5d3SJohn Marino       {
28986d7f5d3SJohn Marino 	mp_limb_t cy;
29086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_addlsh1_n
29186d7f5d3SJohn Marino 	cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
29286d7f5d3SJohn Marino #else
29386d7f5d3SJohn Marino 	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
29486d7f5d3SJohn Marino 	cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
29586d7f5d3SJohn Marino #endif
29686d7f5d3SJohn Marino 	rp[2 * n - 1] += cy;
29786d7f5d3SJohn Marino       }
29886d7f5d3SJohn Marino     }
29986d7f5d3SJohn Marino }
30086d7f5d3SJohn Marino #endif
301