/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0  =  a0         ^2 #   A(0)^2
  v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
  vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
  v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
  vinf=          a2 ^2 # A(inf)^2
*/
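
/* Illustration (not part of the GMP sources): a minimal single-word sketch
   of the five evaluations above.  With A(x) = a2*x^2 + a1*x + a0, A(x)^2 is
   a degree-4 polynomial, so the five values below determine it uniquely.
   Signed arithmetic is used for A(-1); the real code instead works with
   |A(-1)|, which suffices because the square is sign-independent.  */
#if 0
static void
toom3_eval_demo (long a0, long a1, long a2)
{
  long v0   = a0 * a0;					/*  A(0)^2  */
  long v1   = (a0 + a1 + a2) * (a0 + a1 + a2);		/*  A(1)^2  */
  long vm1  = (a0 - a1 + a2) * (a0 - a1 + a2);		/* A(-1)^2  */
  long v2   = (a0 + 2*a1 + 4*a2) * (a0 + 2*a1 + 4*a2);	/*  A(2)^2  */
  long vinf = a2 * a2;					/* A(inf)^2 */
  /* The five products are then combined by mpn_toom_interpolate_5pts.  */
  (void) v0; (void) v1; (void) vm1; (void) v2; (void) vinf;
}
#endif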

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3   1
#else
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif

#define TOOM3_SQR_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_toom2_sqr (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr (p, a, n, ws);					\
  } while (0)
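
/* The recursive calls made through TOOM3_SQR_REC operate on roughly an/3
   limbs.  MAYBE_sqr_basecase and MAYBE_sqr_toom3 evaluate to 0 whenever
   the tuned thresholds prove that a recursive operand can never fall in
   the basecase (resp. toom3) range, letting the compiler discard the
   corresponding dead branch.  */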

void
mpn_toom3_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;

  ASSERT (0 < s && s <= n);

  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  gp = scratch;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif
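
  /* Illustration (not part of the GMP sources): with g = a0 + a2, the code
     above forms as1 = g + a1 = A(1) and asm1 = |g - a1| = |A(-1)|, branching
     on a comparison so the subtraction never underflows.  The sign of A(-1)
     is irrelevant because only its square is used.  Single-word sketch:  */
#if 0
static unsigned long
eval_m1_abs_demo (unsigned long a0, unsigned long a1, unsigned long a2)
{
  unsigned long g = a0 + a2;
  return (g < a1) ? a1 - g : g - a1;	/* |A(-1)|, never underflows */
}
#endif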

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;
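
  /* All three variants above compute as2 = A(2) = a0 + 2*a1 + 4*a2,
     rewritten to suit the available primitives (as1 = a0 + a1 + a2 from
     the previous step):

       rsblsh1 variant:  as2 = 2*(a2 + as1) - a0
       addlsh1 variant:  as2 = 2*(2*a2 + a1) + a0
       generic variant:  as2 = 2*(a2 + as1) - a0, via lshift and sub_n  */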

  ASSERT (as1[n] <= 2);
  ASSERT (asm1[n] <= 1);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+s */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)
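
/* v0, v1 and vinf land directly in the product area pp; v1 overlaps the
   low limbs of vinf, which is why vinf[0] (and, in one branch, vinf[1])
   is saved around the v1 computation below.  vm1 and v2 live in scratch,
   with scratch_out left over for the recursive calls.  */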

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  if (asm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
#endif
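
  /* In the SMALLER_RECURSION branch only the low n limbs l of asm1 are
     squared; the high limb h = asm1[n] (0 or 1) is folded back in using

       (h*B^n + l)^2 = l^2 + 2*h*l*B^n + h^2*B^(2n)   (B = limb base).

     Since h is 0 or 1, h^2 = h, and 2*h*l amounts to two conditional
     additions of l at offset n, which is what the guarded mpn_add_n
     calls above perform.  */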

  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM3_SQR_REC (v1, as1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * as1[n] + mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (as1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif
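
  /* The v1 fixup uses the same identity as for vm1, but here h = as1[n]
     can be 0, 1 or 2.  For h == 1 the two plain additions of as1 supply
     2*l*B^n plus B^(2n); for h == 2 each addlsh1_n_ip1 (or addmul_1 by 2)
     adds 2*l, giving 4*l*B^n in total, and the 2*as1[n] term accounts for
     h^2 = 4 at offset 2n.  In the other branch, squaring the full n+1
     limbs writes 2n+2 limbs, so vinf[1] is saved and restored around the
     recursive call (vinf[0] was already saved in vinf0).  */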

  TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
}
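
/* Illustration (not part of the GMP sources): a minimal sketch of how a
   caller inside GMP could reach this function, assuming scratch sized via
   the mpn_toom3_sqr_itch macro from gmp-impl.h.  Application code should
   use the documented mpz/mpn interfaces instead.  */
#if 0
static void
toom3_sqr_demo (mp_ptr pp, mp_srcptr ap, mp_size_t an)
{
  /* VLA for brevity; GMP itself uses TMP_ALLOC.  an must satisfy the
     function's own ASSERT, i.e. 0 < an - 2*n <= n with n = (an + 2) / 3.  */
  mp_limb_t scratch[mpn_toom3_sqr_itch (an)];
  mpn_toom3_sqr (pp, ap, an, scratch);	/* pp receives 2*an limbs */
}
#endif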