/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

   Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than is used in the standard Barrett algorithm, thus saving time in the
   Newton iterations while paying just a small price when using the inverted
   value for developing quotient bits.  This algorithm was presented at
   ICMS 2006.
*/

#include "gmp-impl.h"


/* N = {np,nn}
   D = {dp,dn}

   Requirements: N >= D
                 D >= 1
                 D odd
                 dn >= 2
                 nn >= 2
                 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before the loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for the (qn > dn) case; 3*dn might be possible.  In
          particular, when dn == in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
static void
mpn_mu_bdiv_q_old (mp_ptr qp,
                   mp_srcptr np, mp_size_t nn,
                   mp_srcptr dp, mp_size_t dn,
                   mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t in;
  int cy, c0;
  mp_size_t tn, wn;

  qn = nn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
                        |________|   divisor  */

#define ip          scratch                    /* in */
#define rp          (scratch + in)             /* dn or rest >= binvert_itch(in) */
#define tp          (scratch + in + dn)        /* dn+in or next_size(dn) */
#define scratch_out (scratch + in + dn + tn)   /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
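
      /* For example, qn = 10 and dn = 3 give b = 4 and in = 3: a single
         3-limb inverse serves to develop the quotient in 4 chunks of at
         most 3 limbs each.  */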

      /* Some notes on allocation:

         When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
         limbs of R die at that point.  We could save memory by letting T live
         just under R, and let the upper part of T expand into R.  These
         changes should reduce itch to perhaps 3dn.  */

      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
        {
          if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
            mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
          else
            {
              tn = mpn_mulmod_bnm1_next_size (dn);
              mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
              wn = dn + in - tn;                /* number of wrapped limbs */
              if (wn > 0)
                {
                  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
                  mpn_decr_u (tp + wn, c0);
                }
            }

          qp += in;
          if (dn != in)
            {
              /* Subtract tp[dn-1...in] from partial remainder.  */
              cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
              if (cy == 2)
                {
                  mpn_incr_u (tp + dn, 1);
                  cy = 1;
                }
            }
          /* Subtract tp[dn+in-1...dn] from dividend.  */
          cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
          np += in;
          mpn_mullo_n (qp, rp, ip, in);
          qn -= in;
        }

      /* Generate last qn limbs.
         FIXME: It should be possible to limit precision here, since qn is
         typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, in);           /* mulhi, need tp[qn+in-1...in] */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
          wn = dn + in - tn;                    /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_sub_n (tp + tn, tp, rp, wn);
              mpn_decr_u (tp + wn, c0);
            }
        }

      qp += in;
      if (dn != in)
        {
          cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
          if (cy == 2)
            {
              mpn_incr_u (tp + dn, 1);
              cy = 1;
            }
        }

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
         |________________|          divisor  */

#define ip          scratch              /* in */
#define tp          (scratch + in)       /* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out (scratch + in + tn)  /* mulmod_bnm1_itch(next_size(qn)) */
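
      /* Single-block case: develop in = ceil(qn/2) low quotient limbs from a
         half-sized inverse, multiply them back by D keeping only the high
         part of the product, subtract that from N, then one more mullo gives
         the remaining qn - in limbs.  */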

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);             /* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, qn, qp, in);           /* mulhigh */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (qn);
          mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
          wn = qn + in - tn;                    /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_cmp (tp, np, wn) < 0;
              mpn_decr_u (tp + wn, c0);
            }
        }

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);   /* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

void
mpn_mu_bdiv_q (mp_ptr qp,
               mp_srcptr np, mp_size_t nn,
               mp_srcptr dp, mp_size_t dn,
               mp_ptr scratch)
{
  mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch);
  mpn_neg (qp, qp, nn);
}

mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        {
          tn = dn + in;
          itch_out = 0;
        }
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
        }
      itches = dn + tn + itch_out;
    }
  else
    {
      in = qn - (qn >> 1);
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        {
          tn = qn + in;
          itch_out = 0;
        }
      else
        {
          tn = mpn_mulmod_bnm1_next_size (qn);
          itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
        }
      itches = tn + itch_out;
    }

  itch_binvert = mpn_binvert_itch (in);
  return in + MAX (itches, itch_binvert);
}
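
/* Usage sketch (illustrative only, not part of the library interface): a
   caller sizes the scratch area with mpn_mu_bdiv_q_itch and passes it to
   mpn_mu_bdiv_q, with operands meeting the requirements listed at the top of
   this file (N >= D, D odd, dn >= 2, nn >= 2).  The TMP_* macros are GMP's
   internal temporary allocator from gmp-impl.h; the function name below is
   made up for the example.  */
#if 0
static void
example_bdiv_q (mp_ptr qp, mp_srcptr np, mp_size_t nn,
                mp_srcptr dp, mp_size_t dn)
{
  mp_ptr scratch;
  TMP_DECL;

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
  mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);  /* result in {qp,nn} */
  TMP_FREE;
}
#endif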