xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/sqrlo.c (revision c38e7cc395b1472a774ff828e46123de44c628e9)
1 /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
2    of the result.
3 
4    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5 
6    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
7    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
8    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16 
17   * the GNU Lesser General Public License as published by the Free
18     Software Foundation; either version 3 of the License, or (at your
19     option) any later version.
20 
21 or
22 
23   * the GNU General Public License as published by the Free Software
24     Foundation; either version 2 of the License, or (at your option) any
25     later version.
26 
27 or both in parallel, as here.
28 
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32 for more details.
33 
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library.  If not,
36 see https://www.gnu.org/licenses/.  */
37 
38 #include "gmp.h"
39 #include "gmp-impl.h"
40 
41 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
42 #define MAYBE_range_basecase 1
43 #define MAYBE_range_toom22   1
44 #else
45 #define MAYBE_range_basecase                                           \
46   ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
47 #define MAYBE_range_toom22                                             \
48   ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
49 #endif
50 
51 /*  THINK: The DC strategy uses different constants in different Toom's
52 	 ranges. Something smoother?
53 */
54 
55 /*
56   Compute the least significant half of the product {xy,n}*{yp,n}, or
57   formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
58 
59   Above the given threshold, the Divide and Conquer strategy is used.
60   The operand is split in two, and a full square plus a mullo
61   is used to obtain the final result. The more natural strategy is to
62   split in two halves, but this is far from optimal when a
63   sub-quadratic multiplication is used.
64 
65   Mulders suggests an unbalanced split in favour of the full product,
66   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
67 
68   To compute the value of a, we assume that the cost of mullo for a
69   given size ML(n) is a fraction of the cost of a full product with
70   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
71   then we can write:
72 
73   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
74 
75   Given a value for e, want to minimise the value of k, i.e. the
76   function k=(1-a)^e/(1-2*a^e).
77 
78   With e=2, the exponent for schoolbook multiplication, the minimum is
79   given by the values a=1-a=1/2.
80 
81   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
82   Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
83 
84   Other possible approximations follow:
85   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
86   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
87   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
88   e=log(15)/log(8) [Toom-8] -> a ~= 1/10
89 
90   The values above where obtained with the following trivial commands
91   in the gp-pari shell:
92 
93 fun(e,a)=(1-a)^e/(1-2*a^e)
94 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
95 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
96 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
97 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
98 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
99 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
100 
101   ,
102   |\
103   | \
104   +----,
105   |    |
106   |    |
107   |    |\
108   |    | \
109   +----+--`
110   ^ n2 ^n1^
111 
112   For an actual implementation, the assumption that M(n)=n^e is
113   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
114   with a constant k is wrong.
115 
116   But theory suggest us two things:
117   - the best the multiplication product is (lower e), the more k
118     approaches 1, and a approaches 0.
119 
120   - A value for a smaller than optimal is probably less bad than a
121     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
122     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
123     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
124 */
125 
126 static mp_size_t
127 mpn_sqrlo_itch (mp_size_t n)
128 {
129   return 2*n;
130 }
131 
132 /*
133     mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
134     It accepts tp == rp.
135 */
136 static void
137 mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
138 {
139   mp_size_t n2, n1;
140   ASSERT (n >= 2);
141   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
142   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
143 
144   /* Divide-and-conquer */
145 
146   /* We need fractional approximation of the value 0 < a <= 1/2
147      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
148   */
149   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
150     n1 = n >> 1;
151   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
152     n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
153   else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
154     n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
155   else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
156     n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
157   /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
158   else
159     n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
160 
161   n2 = n - n1;
162 
163   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
164 
165   /* x0 ^ 2 */
166   mpn_sqr (tp, xp, n2);
167   MPN_COPY (rp, tp, n2);
168 
169   /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
170   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
171     mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
172   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
173     mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
174   else
175     mpn_mullo_n (tp + n, xp + n2, xp, n1);
176   /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
177 #if HAVE_NATIVE_mpn_addlsh1_n
178   mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
179 #else
180   mpn_lshift (rp + n2, tp + n, n1, 1);
181   mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
182 #endif
183 }
184 
185 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
186 #define SQR_BASECASE_ALLOC \
187  (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
188 
189 /* FIXME: This function should accept a temporary area; dc_sqrlo
190    accepts a pointer tp, and handle the case tp == rp, do the same here.
191 */
192 
193 void
194 mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
195 {
196   ASSERT (n >= 1);
197   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
198 
199   if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
200     {
201       /* FIXME: smarter criteria? */
202 #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
203       /* mullo computes as many products as sqr, but directly writes
204 	 on the result area. */
205       mpn_mullo_basecase (rp, xp, xp, n);
206 #else
207       /* Allocate workspace of fixed size on stack: fast! */
208       mp_limb_t tp[SQR_BASECASE_ALLOC];
209       mpn_sqr_basecase (tp, xp, n);
210       MPN_COPY (rp, tp, n);
211 #endif
212     }
213   else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
214     {
215       mpn_sqrlo_basecase (rp, xp, n);
216     }
217   else
218     {
219       mp_ptr tp;
220       TMP_DECL;
221       TMP_MARK;
222       tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
223       if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
224 	{
225 	  mpn_dc_sqrlo (rp, xp, n, tp);
226 	}
227       else
228 	{
229 	  /* For really large operands, use plain mpn_mul_n but throw away upper n
230 	     limbs of result.  */
231 #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
232 	  mpn_fft_mul (tp, xp, n, xp, n);
233 #else
234 	  mpn_sqr (tp, xp, n);
235 #endif
236 	  MPN_COPY (rp, tp, n);
237 	}
238       TMP_FREE;
239     }
240 }
241