xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/toom42_mulmid.c (revision 501cd18a74d52bfcca7d9e7e3b0d472bbc870558)
1 /* mpn_toom42_mulmid -- toom42 middle product
2 
3    Contributed by David Harvey.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2011 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17 
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21 License for more details.
22 
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25 
26 
27 #include "gmp.h"
28 #include "gmp-impl.h"
29 
30 
31 
32 /*
33   Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
34 
35   Neither ap nor bp may overlap rp.
36 
37   Must have n >= 4.
38 
39   Amount of scratch space required is given by mpn_toom42_mulmid_itch().
40 
41   FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
42   requirements should be clarified.
43 */
44 void
45 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
46                    mp_ptr scratch)
47 {
48   mp_limb_t cy, e[12], zh, zl;
49   mp_size_t m;
50   int neg;
51 
52   ASSERT (n >= 4);
53   ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
54   ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
55 
56   ap += n & 1;   /* handle odd row and diagonal later */
57   m = n / 2;
58 
59   /* (e0h:e0l) etc are correction terms, in 2's complement */
60 #define e0l (e[0])
61 #define e0h (e[1])
62 #define e1l (e[2])
63 #define e1h (e[3])
64 #define e2l (e[4])
65 #define e2h (e[5])
66 #define e3l (e[6])
67 #define e3h (e[7])
68 #define e4l (e[8])
69 #define e4h (e[9])
70 #define e5l (e[10])
71 #define e5h (e[11])
72 
73 #define s (scratch + 2)
74 #define t (rp + m + 2)
75 #define p0 rp
76 #define p1 scratch
77 #define p2 (rp + m)
78 #define next_scratch (scratch + 3*m + 1)
79 
80   /*
81             rp                            scratch
82   |---------|-----------|    |---------|---------|----------|
83   0         m         2m+2   0         m         2m        3m+1
84             <----p2---->       <-------------s------------->
85   <----p0----><---t---->     <----p1---->
86   */
87 
88   /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
89   cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
90   cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
91 		       bp + m, bp, m, cy);
92   mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
93 
94   /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
95   if (mpn_cmp (bp + m, bp, m) < 0)
96     {
97       ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
98 				      ap + m - 1, ap + 2*m - 1, m, 0));
99       neg = 1;
100     }
101   else
102     {
103       ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
104 				      ap + m - 1, ap + 2*m - 1, m, 0));
105       neg = 0;
106     }
107 
108   /* recursive middle products. The picture is:
109 
110       b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
111       ...       -   A   A   A   B   B   B   -   -   -   -
112       b[m]      -   -   A   A   A   B   B   B   -   -   -
113       b[m-1]    -   -   -   C   C   C   D   D   D   -   -
114       ...       -   -   -   -   C   C   C   D   D   D   -
115       b[0]      -   -   -   -   -   C   C   C   D   D   D
116                a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
117   */
118 
119   if (m < MULMID_TOOM42_THRESHOLD)
120     {
121       /* A + B */
122       mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
123       /* accumulate high limbs of p0 into e1 */
124       ADDC_LIMB (cy, e1l, e1l, p0[m]);
125       e1h += p0[m + 1] + cy;
126       /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
127       mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
128       /* C + D   (overwrites t) */
129       mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
130     }
131   else
132     {
133       /* as above, but use toom42 instead */
134       mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
135       ADDC_LIMB (cy, e1l, e1l, p0[m]);
136       e1h += p0[m + 1] + cy;
137       mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
138       mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
139     }
140 
141   /* apply error terms */
142 
143   /* -e0 at rp[0] */
144   SUBC_LIMB (cy, rp[0], rp[0], e0l);
145   SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
146   if (UNLIKELY (cy))
147     {
148       cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
149       SUBC_LIMB (cy, e1l, e1l, cy);
150       e1h -= cy;
151     }
152 
153   /* z = e1 - e2 + high(p0) */
154   SUBC_LIMB (cy, zl, e1l, e2l);
155   zh = e1h - e2h - cy;
156 
157   /* z at rp[m] */
158   ADDC_LIMB (cy, rp[m], rp[m], zl);
159   zh = (zh + cy) & GMP_NUMB_MASK;
160   ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
161   cy -= (zh >> (GMP_NUMB_BITS - 1));
162   if (UNLIKELY (cy))
163     {
164       if (cy == 1)
165 	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
166       else /* cy == -1 */
167 	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
168     }
169 
170   /* e3 at rp[2*m] */
171   ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
172   rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
173 
174   /* e4 at p1[0] */
175   ADDC_LIMB (cy, p1[0], p1[0], e4l);
176   ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
177   if (UNLIKELY (cy))
178     mpn_add_1 (p1 + 2, p1 + 2, m, 1);
179 
180   /* -e5 at p1[m] */
181   SUBC_LIMB (cy, p1[m], p1[m], e5l);
182   p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
183 
184   /* adjustment if p1 ends up negative */
185   cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
186 
187   /* add (-1)^neg * (p1 - B^m * p1) to output */
188   if (neg)
189     {
190       mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
191       mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
192       mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
193     }
194   else
195     {
196       mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
197       mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
198       mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
199     }
200 
201   /* odd row and diagonal */
202   if (n & 1)
203     {
204       /*
205         Products marked E are already done. We need to do products marked O.
206 
207         OOOOO----
208         -EEEEO---
209         --EEEEO--
210         ---EEEEO-
211         ----EEEEO
212        */
213 
214       /* first row of O's */
215       cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
216       ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
217 
218       /* O's on diagonal */
219       /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
220          that can handle the sum below. Currently we're relying on
221          mulmid_basecase being pretty fast for a diagonal sum like this,
222 	 which is true at least for the K8 asm verion, but surely false
223 	 for the generic version. */
224       mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
225       mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
226     }
227 }
228