1 /* mpn_toom42_mulmid -- toom42 middle product 2 3 Contributed by David Harvey. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2011 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of the GNU Lesser General Public License as published by 15 the Free Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18 The GNU MP Library is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 21 License for more details. 22 23 You should have received a copy of the GNU Lesser General Public License 24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 25 26 27 #include "gmp.h" 28 #include "gmp-impl.h" 29 30 31 32 /* 33 Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}. 34 35 Neither ap nor bp may overlap rp. 36 37 Must have n >= 4. 38 39 Amount of scratch space required is given by mpn_toom42_mulmid_itch(). 40 41 FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact 42 requirements should be clarified. 43 */ 44 void 45 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, 46 mp_ptr scratch) 47 { 48 mp_limb_t cy, e[12], zh, zl; 49 mp_size_t m; 50 int neg; 51 52 ASSERT (n >= 4); 53 ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); 54 ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); 55 56 ap += n & 1; /* handle odd row and diagonal later */ 57 m = n / 2; 58 59 /* (e0h:e0l) etc are correction terms, in 2's complement */ 60 #define e0l (e[0]) 61 #define e0h (e[1]) 62 #define e1l (e[2]) 63 #define e1h (e[3]) 64 #define e2l (e[4]) 65 #define e2h (e[5]) 66 #define e3l (e[6]) 67 #define e3h (e[7]) 68 #define e4l (e[8]) 69 #define e4h (e[9]) 70 #define e5l (e[10]) 71 #define e5h (e[11]) 72 73 #define s (scratch + 2) 74 #define t (rp + m + 2) 75 #define p0 rp 76 #define p1 scratch 77 #define p2 (rp + m) 78 #define next_scratch (scratch + 3*m + 1) 79 80 /* 81 rp scratch 82 |---------|-----------| |---------|---------|----------| 83 0 m 2m+2 0 m 2m 3m+1 84 <----p2----> <-------------s-------------> 85 <----p0----><---t----> <----p1----> 86 */ 87 88 /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */ 89 cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0); 90 cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l, 91 bp + m, bp, m, cy); 92 mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy); 93 94 /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */ 95 if (mpn_cmp (bp + m, bp, m) < 0) 96 { 97 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l, 98 ap + m - 1, ap + 2*m - 1, m, 0)); 99 neg = 1; 100 } 101 else 102 { 103 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l, 104 ap + m - 1, ap + 2*m - 1, m, 0)); 105 neg = 0; 106 } 107 108 /* recursive middle products. The picture is: 109 110 b[2m-1] A A A B B B - - - - - 111 ... - A A A B B B - - - - 112 b[m] - - A A A B B B - - - 113 b[m-1] - - - C C C D D D - - 114 ... - - - - C C C D D D - 115 b[0] - - - - - C C C D D D 116 a[0] ... a[m] ... a[2m] ... a[4m-2] 117 */ 118 119 if (m < MULMID_TOOM42_THRESHOLD) 120 { 121 /* A + B */ 122 mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m); 123 /* accumulate high limbs of p0 into e1 */ 124 ADDC_LIMB (cy, e1l, e1l, p0[m]); 125 e1h += p0[m + 1] + cy; 126 /* (-1)^neg * (B - C) (overwrites first m limbs of s) */ 127 mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m); 128 /* C + D (overwrites t) */ 129 mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m); 130 } 131 else 132 { 133 /* as above, but use toom42 instead */ 134 mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch); 135 ADDC_LIMB (cy, e1l, e1l, p0[m]); 136 e1h += p0[m + 1] + cy; 137 mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch); 138 mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch); 139 } 140 141 /* apply error terms */ 142 143 /* -e0 at rp[0] */ 144 SUBC_LIMB (cy, rp[0], rp[0], e0l); 145 SUBC_LIMB (cy, rp[1], rp[1], e0h + cy); 146 if (UNLIKELY (cy)) 147 { 148 cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1; 149 SUBC_LIMB (cy, e1l, e1l, cy); 150 e1h -= cy; 151 } 152 153 /* z = e1 - e2 + high(p0) */ 154 SUBC_LIMB (cy, zl, e1l, e2l); 155 zh = e1h - e2h - cy; 156 157 /* z at rp[m] */ 158 ADDC_LIMB (cy, rp[m], rp[m], zl); 159 zh = (zh + cy) & GMP_NUMB_MASK; 160 ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh); 161 cy -= (zh >> (GMP_NUMB_BITS - 1)); 162 if (UNLIKELY (cy)) 163 { 164 if (cy == 1) 165 mpn_add_1 (rp + m + 2, rp + m + 2, m, 1); 166 else /* cy == -1 */ 167 mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1); 168 } 169 170 /* e3 at rp[2*m] */ 171 ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l); 172 rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK; 173 174 /* e4 at p1[0] */ 175 ADDC_LIMB (cy, p1[0], p1[0], e4l); 176 ADDC_LIMB (cy, p1[1], p1[1], e4h + cy); 177 if (UNLIKELY (cy)) 178 mpn_add_1 (p1 + 2, p1 + 2, m, 1); 179 180 /* -e5 at p1[m] */ 181 SUBC_LIMB (cy, p1[m], p1[m], e5l); 182 p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK; 183 184 /* adjustment if p1 ends up negative */ 185 cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1)); 186 187 /* add (-1)^neg * (p1 - B^m * p1) to output */ 188 if (neg) 189 { 190 mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy); 191 mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ 192 mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */ 193 } 194 else 195 { 196 mpn_add_1 (rp + m + 2, rp + m + 2, m, cy); 197 mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ 198 mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */ 199 } 200 201 /* odd row and diagonal */ 202 if (n & 1) 203 { 204 /* 205 Products marked E are already done. We need to do products marked O. 206 207 OOOOO---- 208 -EEEEO--- 209 --EEEEO-- 210 ---EEEEO- 211 ----EEEEO 212 */ 213 214 /* first row of O's */ 215 cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]); 216 ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy); 217 218 /* O's on diagonal */ 219 /* FIXME: should probably define an interface "mpn_mulmid_diag_1" 220 that can handle the sum below. Currently we're relying on 221 mulmid_basecase being pretty fast for a diagonal sum like this, 222 which is true at least for the K8 asm verion, but surely false 223 for the generic version. */ 224 mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1); 225 mpn_add_n (rp + n - 1, rp + n - 1, e, 3); 226 } 227 } 228