xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc64/mode1o.c (revision e5d758f832e07a177fa24707c434b7ce26d0f762)
1 /* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder.
2 
3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5    FUTURE GNU MP RELEASES.
6 
7 Copyright 2000-2003 Free Software Foundation, Inc.
8 
9 This file is part of the GNU MP Library.
10 
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
13 
14   * the GNU Lesser General Public License as published by the Free
15     Software Foundation; either version 3 of the License, or (at your
16     option) any later version.
17 
18 or
19 
20   * the GNU General Public License as published by the Free Software
21     Foundation; either version 2 of the License, or (at your option) any
22     later version.
23 
24 or both in parallel, as here.
25 
26 The GNU MP Library is distributed in the hope that it will be useful, but
27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29 for more details.
30 
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library.  If not,
33 see https://www.gnu.org/licenses/.  */
34 
35 #include "gmp-impl.h"
36 #include "longlong.h"
37 
38 #include "mpn/sparc64/sparc64.h"
39 
40 
41 /*                 64-bit divisor   32-bit divisor
42                     cycles/limb      cycles/limb
43                      (approx)         (approx)
44    Ultrasparc 2i:       ?                ?
45 */
46 
47 
48 /* This implementation reduces the number of multiplies done, knowing that
49    on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.
50 
51    The key idea is to use the fact that the low limb of q*d equals l, this
52    being the whole purpose of the q calculated.  It means there's no need to
53    calculate the lowest 32x32->64 part of the q*d, instead it can be
54    inferred from l and the other three 32x32->64 parts.  See sparc64.h for
55    details.
56 
57    When d is 32-bits, the same applies, but in this case there's only one
58    other 32x32->64 part (ie. HIGH(q)*d).
59 
60    The net effect is that for 64-bit divisor each limb is 4 mulx, or for
61    32-bit divisor each is 2 mulx.
62 
63    Enhancements:
64 
65    No doubt this could be done in assembler, if that helped the scheduling,
66    or perhaps guaranteed good code irrespective of the compiler.
67 
68    Alternatives:
69 
70    It might be possibly to use floating point.  The loop is dominated by
71    multiply latency, so not sure if floats would improve that.  One
72    possibility would be to take two limbs at a time, with a 128 bit inverse,
73    if there's enough registers, which could effectively use float throughput
74    to reduce total latency across two limbs.  */
75 
76 #define ASSERT_RETVAL(r)                \
77   ASSERT (orig_c < d ? r < d : r <= d)
78 
79 mp_limb_t
80 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
81 {
82   mp_limb_t  c = orig_c;
83   mp_limb_t  s, l, q, h, inverse;
84 
85   ASSERT (size >= 1);
86   ASSERT (d & 1);
87   ASSERT_MPN (src, size);
88   ASSERT_LIMB (d);
89   ASSERT_LIMB (c);
90 
91   /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
92   if (size == 1)
93     {
94       s = src[0];
95       if (s > c)
96 	{
97 	  l = s-c;
98 	  h = l % d;
99 	  if (h != 0)
100 	    h = d - h;
101 	}
102       else
103 	{
104 	  l = c-s;
105 	  h = l % d;
106 	}
107       return h;
108     }
109 
110   binvert_limb (inverse, d);
111 
112   if (d <= 0xFFFFFFFF)
113     {
114       s = *src++;
115       size--;
116       do
117         {
118           SUBC_LIMB (c, l, s, c);
119           s = *src++;
120           q = l * inverse;
121           umul_ppmm_half_lowequal (h, q, d, l);
122           c += h;
123           size--;
124         }
125       while (size != 0);
126 
127       if (s <= d)
128         {
129           /* With high s <= d the final step can be a subtract and addback.
130              If c==0 then the addback will restore to l>=0.  If c==d then
131              will get l==d if s==0, but that's ok per the function
132              definition.  */
133 
134           l = c - s;
135           l += (l > c ? d : 0);
136 
137           ASSERT_RETVAL (l);
138           return l;
139         }
140       else
141         {
142           /* Can't skip a divide, just do the loop code once more. */
143           SUBC_LIMB (c, l, s, c);
144           q = l * inverse;
145           umul_ppmm_half_lowequal (h, q, d, l);
146           c += h;
147 
148           ASSERT_RETVAL (c);
149           return c;
150         }
151     }
152   else
153     {
154       mp_limb_t  dl = LOW32 (d);
155       mp_limb_t  dh = HIGH32 (d);
156       long i;
157 
158       s = *src++;
159       size--;
160       do
161         {
162           SUBC_LIMB (c, l, s, c);
163           s = *src++;
164           q = l * inverse;
165           umul_ppmm_lowequal (h, q, d, dh, dl, l);
166           c += h;
167           size--;
168         }
169       while (size != 0);
170 
171       if (s <= d)
172         {
173           /* With high s <= d the final step can be a subtract and addback.
174              If c==0 then the addback will restore to l>=0.  If c==d then
175              will get l==d if s==0, but that's ok per the function
176              definition.  */
177 
178           l = c - s;
179           l += (l > c ? d : 0);
180 
181           ASSERT_RETVAL (l);
182           return l;
183         }
184       else
185         {
186           /* Can't skip a divide, just do the loop code once more. */
187           SUBC_LIMB (c, l, s, c);
188           q = l * inverse;
189           umul_ppmm_lowequal (h, q, d, dh, dl, l);
190           c += h;
191 
192           ASSERT_RETVAL (c);
193           return c;
194         }
195     }
196 }
197