1 /* mpn_toom42_mulmid -- toom42 middle product 2 3 Contributed by David Harvey. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2011 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20 or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26 or both in parallel, as here. 27 28 The GNU MP Library is distributed in the hope that it will be useful, but 29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31 for more details. 32 33 You should have received copies of the GNU General Public License and the 34 GNU Lesser General Public License along with the GNU MP Library. If not, 35 see https://www.gnu.org/licenses/. */ 36 37 38 #include "gmp-impl.h" 39 40 41 42 /* 43 Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}. 44 45 Neither ap nor bp may overlap rp. 46 47 Must have n >= 4. 48 49 Amount of scratch space required is given by mpn_toom42_mulmid_itch(). 50 51 FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact 52 requirements should be clarified. 53 */ 54 void 55 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, 56 mp_ptr scratch) 57 { 58 mp_limb_t cy, e[12], zh, zl; 59 mp_size_t m; 60 int neg; 61 62 ASSERT (n >= 4); 63 ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); 64 ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); 65 66 ap += n & 1; /* handle odd row and diagonal later */ 67 m = n / 2; 68 69 /* (e0h:e0l) etc are correction terms, in 2's complement */ 70 #define e0l (e[0]) 71 #define e0h (e[1]) 72 #define e1l (e[2]) 73 #define e1h (e[3]) 74 #define e2l (e[4]) 75 #define e2h (e[5]) 76 #define e3l (e[6]) 77 #define e3h (e[7]) 78 #define e4l (e[8]) 79 #define e4h (e[9]) 80 #define e5l (e[10]) 81 #define e5h (e[11]) 82 83 #define s (scratch + 2) 84 #define t (rp + m + 2) 85 #define p0 rp 86 #define p1 scratch 87 #define p2 (rp + m) 88 #define next_scratch (scratch + 3*m + 1) 89 90 /* 91 rp scratch 92 |---------|-----------| |---------|---------|----------| 93 0 m 2m+2 0 m 2m 3m+1 94 <----p2----> <-------------s-------------> 95 <----p0----><---t----> <----p1----> 96 */ 97 98 /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */ 99 cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0); 100 cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l, 101 bp + m, bp, m, cy); 102 mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy); 103 104 /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */ 105 if (mpn_cmp (bp + m, bp, m) < 0) 106 { 107 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l, 108 ap + m - 1, ap + 2*m - 1, m, 0)); 109 neg = 1; 110 } 111 else 112 { 113 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l, 114 ap + m - 1, ap + 2*m - 1, m, 0)); 115 neg = 0; 116 } 117 118 /* recursive middle products. The picture is: 119 120 b[2m-1] A A A B B B - - - - - 121 ... - A A A B B B - - - - 122 b[m] - - A A A B B B - - - 123 b[m-1] - - - C C C D D D - - 124 ... - - - - C C C D D D - 125 b[0] - - - - - C C C D D D 126 a[0] ... a[m] ... a[2m] ... a[4m-2] 127 */ 128 129 if (m < MULMID_TOOM42_THRESHOLD) 130 { 131 /* A + B */ 132 mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m); 133 /* accumulate high limbs of p0 into e1 */ 134 ADDC_LIMB (cy, e1l, e1l, p0[m]); 135 e1h += p0[m + 1] + cy; 136 /* (-1)^neg * (B - C) (overwrites first m limbs of s) */ 137 mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m); 138 /* C + D (overwrites t) */ 139 mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m); 140 } 141 else 142 { 143 /* as above, but use toom42 instead */ 144 mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch); 145 ADDC_LIMB (cy, e1l, e1l, p0[m]); 146 e1h += p0[m + 1] + cy; 147 mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch); 148 mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch); 149 } 150 151 /* apply error terms */ 152 153 /* -e0 at rp[0] */ 154 SUBC_LIMB (cy, rp[0], rp[0], e0l); 155 SUBC_LIMB (cy, rp[1], rp[1], e0h + cy); 156 if (UNLIKELY (cy)) 157 { 158 cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1; 159 SUBC_LIMB (cy, e1l, e1l, cy); 160 e1h -= cy; 161 } 162 163 /* z = e1 - e2 + high(p0) */ 164 SUBC_LIMB (cy, zl, e1l, e2l); 165 zh = e1h - e2h - cy; 166 167 /* z at rp[m] */ 168 ADDC_LIMB (cy, rp[m], rp[m], zl); 169 zh = (zh + cy) & GMP_NUMB_MASK; 170 ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh); 171 cy -= (zh >> (GMP_NUMB_BITS - 1)); 172 if (UNLIKELY (cy)) 173 { 174 if (cy == 1) 175 mpn_add_1 (rp + m + 2, rp + m + 2, m, 1); 176 else /* cy == -1 */ 177 mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1); 178 } 179 180 /* e3 at rp[2*m] */ 181 ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l); 182 rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK; 183 184 /* e4 at p1[0] */ 185 ADDC_LIMB (cy, p1[0], p1[0], e4l); 186 ADDC_LIMB (cy, p1[1], p1[1], e4h + cy); 187 if (UNLIKELY (cy)) 188 mpn_add_1 (p1 + 2, p1 + 2, m, 1); 189 190 /* -e5 at p1[m] */ 191 SUBC_LIMB (cy, p1[m], p1[m], e5l); 192 p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK; 193 194 /* adjustment if p1 ends up negative */ 195 cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1)); 196 197 /* add (-1)^neg * (p1 - B^m * p1) to output */ 198 if (neg) 199 { 200 mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy); 201 mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ 202 mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */ 203 } 204 else 205 { 206 mpn_add_1 (rp + m + 2, rp + m + 2, m, cy); 207 mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ 208 mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */ 209 } 210 211 /* odd row and diagonal */ 212 if (n & 1) 213 { 214 /* 215 Products marked E are already done. We need to do products marked O. 216 217 OOOOO---- 218 -EEEEO--- 219 --EEEEO-- 220 ---EEEEO- 221 ----EEEEO 222 */ 223 224 /* first row of O's */ 225 cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]); 226 ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy); 227 228 /* O's on diagonal */ 229 /* FIXME: should probably define an interface "mpn_mulmid_diag_1" 230 that can handle the sum below. Currently we're relying on 231 mulmid_basecase being pretty fast for a diagonal sum like this, 232 which is true at least for the K8 asm version, but surely false 233 for the generic version. */ 234 mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1); 235 mpn_add_n (rp + n - 1, rp + n - 1, e, 3); 236 } 237 } 238