/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division.

Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"


/*                   64-bit divisor       32-bit divisor
                       cycles/limb          cycles/limb
                        (approx)             (approx)
                   integer  fraction    integer  fraction
   Ultrasparc 2i:    160      160          122      96
*/


/* 32-bit divisors are treated in special case code.  This requires 4 mulx
   per limb instead of 8 in the general case.

   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
   addressing, to get the two halves of each limb read in the correct order.
   This is kept in an adj variable.  Doing that measures about 4 c/l faster
   than just writing HALF_ENDIAN_ADJ(i) in the integer loop.  The latter
   shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well
   (on gcc 3.2.1 at least).  The fraction loop doesn't seem affected, but we
   still use a variable since that ought to work out best.  */
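
/* Worked example of the half-limb addressing, assuming the HALF_ENDIAN_ADJ
   definitions in mpn/sparc64/sparc64.h: the macro is 0 when the 32-bit
   halves of a limb lie in memory little endian, and +1 for even i / -1 for
   odd i when they lie big endian.  Reading the limb 0x0000000500000007 as
   an array of unsigned halves, logical half 0 (value 0x00000007) is at
   index 0 little endian but index 1 big endian:

       logical index i:    0   1   2   3
       big endian adj:    +1  -1  +1  -1
       memory index:       1   0   3   2

   Since the adjustment only alternates in sign, the loops below keep it in
   an adj variable and flip it with adj = -adj each iteration rather than
   re-evaluating the macro.  */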

mp_limb_t
mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs,
              mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  mp_size_t  total_size_limbs;
  mp_size_t  i;

  ASSERT (xsize_limbs >= 0);
  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);
  /* FIXME: What's the correct overlap rule when xsize!=0? */
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs,
                                  ap_limbptr, size_limbs));

  total_size_limbs = size_limbs + xsize_limbs;
  if (UNLIKELY (total_size_limbs == 0))
    return 0;

  /* udivx is good for total_size==1, and there's no need to bother checking
     limb < divisor, since if that case is likely the caller should check for
     it first.  */
  if (UNLIKELY (total_size_limbs == 1))
    {
      mp_limb_t  a, q;
      a = (LIKELY (size_limbs != 0) ? ap_limbptr[0] : 0);
      q = a / d_limb;
      qp_limbptr[0] = q;
      return a - q*d_limb;
    }

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      mp_size_t  size, xsize, total_size, adj;
      unsigned   *qp, n1, n0, q, r, nshift, norm_rmask;
      mp_limb_t  dinv_limb;
      const unsigned *ap;
      int        norm, norm_rshift;

      size = 2 * size_limbs;
      xsize = 2 * xsize_limbs;
      total_size = size + xsize;

      ap = (unsigned *) ap_limbptr;
      qp = (unsigned *) qp_limbptr;

      qp += xsize;
      r = 0;        /* initial remainder */

      if (LIKELY (size != 0))
        {
          n1 = ap[size-1 + HALF_ENDIAN_ADJ(1)];

          /* If the length of the source is uniformly distributed, then
             there's a 50% chance of the high 32 bits being zero, which we
             can skip.  */
          if (n1 == 0)
            {
              n1 = ap[size-2 + HALF_ENDIAN_ADJ(0)];
              total_size--;
              size--;
              ASSERT (size > 0);  /* because always even */
              qp[size + HALF_ENDIAN_ADJ(1)] = 0;
            }

          /* Skip a division if high < divisor (high quotient 0).  Testing
             here before normalizing will still skip as often as
             possible.  */
          if (n1 < d_limb)
            {
              r = n1;
              size--;
              qp[size + HALF_ENDIAN_ADJ(size)] = 0;
              total_size--;
              if (total_size == 0)
                return r;
            }
        }

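      /* Normalize: shift the divisor up so its bit 31 is set, and shift the
         partial remainder to match; the pre-inverted division below requires
         a normalized divisor.  */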
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;
      r <<= norm;

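      /* A right shift by the full 32-bit width is undefined in C, so when
         norm is zero norm_rmask zeros the shifted-in contribution instead
         of relying on "x >> 32".  */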
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

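      /* dinv_limb is a precomputed fixed-point approximation of the
         reciprocal of the normalized divisor, so each quotient digit below
         costs multiplications rather than a hardware divide.  */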
      invert_half_limb (dinv_limb, d_limb);

      if (LIKELY (size != 0))
        {
          i = size - 1;
          adj = HALF_ENDIAN_ADJ (i);
          n1 = ap[i + adj];
          adj = -adj;
          r |= ((n1 >> norm_rshift) & norm_rmask);
          for ( ; i > 0; i--)
            {
              n0 = ap[i-1 + adj];
              adj = -adj;
              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
              udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
              qp[i + adj] = q;
              n1 = n0;
            }
          nshift = n1 << norm;
          udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
          qp[0 + HALF_ENDIAN_ADJ(0)] = q;
        }
      qp -= xsize;
      adj = HALF_ENDIAN_ADJ (0);
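      /* Fraction half-limbs: continue the division past the radix point by
         feeding in zero dividend halves.  */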
      for (i = xsize-1; i >= 0; i--)
        {
          udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb);
          adj = -adj;
          qp[i + adj] = q;
        }

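      /* The remainder was developed normalized; shift it back down.  */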
      return r >> norm;
    }
  else
    {
      mp_srcptr  ap;
      mp_ptr     qp;
      mp_size_t  size, xsize, total_size;
      mp_limb_t  d, n1, n0, q, r, dinv, nshift, norm_rmask;
      int        norm, norm_rshift;

      ap = ap_limbptr;
      qp = qp_limbptr;
      size = size_limbs;
      xsize = xsize_limbs;
      total_size = total_size_limbs;
      d = d_limb;

      qp += total_size;   /* above high limb */
      r = 0;              /* initial remainder */

      if (LIKELY (size != 0))
        {
          /* Skip a division if high < divisor (high quotient 0).  Testing
             here before normalizing will still skip as often as
             possible.  */
          n1 = ap[size-1];
          if (n1 < d)
            {
              r = n1;
              *--qp = 0;
              total_size--;
              if (total_size == 0)
                return r;
              size--;
            }
        }

      count_leading_zeros (norm, d);
      d <<= norm;
      r <<= norm;

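      /* As in the 32-bit case, norm_rmask guards the undefined full-width
         right shift when norm is zero.  */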
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : ~CNST_LIMB(0));

      invert_limb (dinv, d);

      if (LIKELY (size != 0))
        {
          n1 = ap[size-1];
          r |= ((n1 >> norm_rshift) & norm_rmask);
          for (i = size-2; i >= 0; i--)
            {
              n0 = ap[i];
              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
              udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
              *--qp = q;
              n1 = n0;
            }
          nshift = n1 << norm;
          udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
          *--qp = q;
        }
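      /* Fraction limbs, from zero dividend limbs below the radix point.  */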
      for (i = 0; i < xsize; i++)
        {
          udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv);
          *--qp = q;
        }
      return r >> norm;
    }
}