/* Copyright (C) 2007-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _DIV_MACROS_H_
#define _DIV_MACROS_H_

#include "bid_internal.h"

#define FENCE __fence
//#define FENCE
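// Note: FENCE expands to __fence, presumably an optimization/evaluation
// barrier provided elsewhere in the BID sources; the commented-out
// variant would define it away.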

//#define DOUBLE_EXTENDED_ON

#if DOUBLE_EXTENDED_ON

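// __div_128_by_128: compute *pCQ = CX / CY and *pCR = CX mod CY for
// 128-bit unsigned operands.  This variant relies on 80-bit extended
// precision (BINARY80): a floating-point quotient estimate is formed
// first, then multiplied back and corrected with a few integer steps.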
__BID_INLINE__ void
__div_128_by_128 (UINT128 * pCQ, UINT128 * pCR, UINT128 CX, UINT128 CY) {
  UINT128 CB, CB2, CB4, CB8, CQB, CA;
  int_double d64, dm64, ds;
  int_float t64;
  double dx, dq, dqh;
  BINARY80 lq, lx, ly;
  UINT64 Rh, R, B2, B4, Ph, Ql, Ql2, carry, Qh;

  if (!CY.w[1]) {
    pCR->w[1] = 0;

    if (!CX.w[1]) {
      pCQ->w[0] = CX.w[0] / CY.w[0];
      pCQ->w[1] = 0;
      pCR->w[0] = CX.w[0] - pCQ->w[0] * CY.w[0];
    } else {

      // This path works for CX < 2^116 only

      // 2^64
      d64.i = 0x43f0000000000000;
      // 2^(-64)
      dm64.i = 0x3bf0000000000000;
      // 1.5*2^(-52)
      ds.i = 0x3cb8000000000000;
      dx = (BINARY80) CX.w[1] * d64.d + (BINARY80) CX.w[0];
      dq = dx / (BINARY80) CY.w[0];
      dq -= dq * (ds.d);
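
      // split the quotient estimate into high and low 64-bit halves:
      // Qh ~ floor(dq * 2^(-64)), Ql ~ dq - Qh * 2^64; the low half is
      // then corrected exactly against CY.w[0]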
      dqh = dq * dm64.d;
      Qh = (UINT64) dqh;
      Ql = (UINT64) (dq - ((double) Qh) * d64.d);

      Rh = CX.w[0] - Ql * CY.w[0];
      Ql2 = Rh / CY.w[0];
      pCR->w[0] = Rh - Ql2 * CY.w[0];
      __add_carry_out ((pCQ->w[0]), carry, Ql, Ql2);
      pCQ->w[1] = Qh + carry;

    }
    return;
  }
  // now CY.w[1] > 0

  // 2^64
  t64.i = 0x5f800000;
  lx = (BINARY80) CX.w[1] * (BINARY80) t64.d + (BINARY80) CX.w[0];
  ly = (BINARY80) CY.w[1] * (BINARY80) t64.d + (BINARY80) CY.w[0];
  lq = lx / ly;
  pCQ->w[0] = (UINT64) lq;

  pCQ->w[1] = 0;

  if (!pCQ->w[0]) {
    /*if(__unsigned_compare_ge_128(CX,CY))
       {
       pCQ->w[0] = 1;
       __sub_128_128((*pCR), CX, CY);
       }
       else */
    {
      pCR->w[1] = CX.w[1];
      pCR->w[0] = CX.w[0];
    }
    return;
  }

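  // when CY >= 2^68 or the quotient estimate is at most 2^60, the
  // BINARY80 estimate appears to be within about 2 of the true quotient:
  // back off by 1 and correct upward at most twice; otherwise back off
  // by 6 and recover the gap with shifted multiples of CY (8, 4, 2, 1)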
  if (CY.w[1] >= 16 || pCQ->w[0] <= 0x1000000000000000ull) {
    pCQ->w[0] = (UINT64) lq - 1;
    __mul_64x128_full (Ph, CQB, (pCQ->w[0]), CY);
    __sub_128_128 (CA, CX, CQB);
    if (__unsigned_compare_ge_128 (CA, CY)) {
      __sub_128_128 (CA, CA, CY);
      pCQ->w[0]++;
      if (__unsigned_compare_ge_128 (CA, CY)) {
        __sub_128_128 (CA, CA, CY);
        pCQ->w[0]++;
      }
    }
    pCR->w[1] = CA.w[1];
    pCR->w[0] = CA.w[0];
  } else {
    pCQ->w[0] = (UINT64) lq - 6;

    __mul_64x128_full (Ph, CQB, (pCQ->w[0]), CY);
    __sub_128_128 (CA, CX, CQB);

    CB8.w[1] = (CY.w[1] << 3) | (CY.w[0] >> 61);
    CB8.w[0] = CY.w[0] << 3;
    CB4.w[1] = (CY.w[1] << 2) | (CY.w[0] >> 62);
    CB4.w[0] = CY.w[0] << 2;
    CB2.w[1] = (CY.w[1] << 1) | (CY.w[0] >> 63);
    CB2.w[0] = CY.w[0] << 1;

    if (__unsigned_compare_ge_128 (CA, CB8)) {
      pCQ->w[0] += 8;
      __sub_128_128 (CA, CA, CB8);
    }
    if (__unsigned_compare_ge_128 (CA, CB4)) {
      pCQ->w[0] += 4;
      __sub_128_128 (CA, CA, CB4);
    }
    if (__unsigned_compare_ge_128 (CA, CB2)) {
      pCQ->w[0] += 2;
      __sub_128_128 (CA, CA, CB2);
    }
    if (__unsigned_compare_ge_128 (CA, CY)) {
      pCQ->w[0] += 1;
      __sub_128_128 (CA, CA, CY);
    }

    pCR->w[1] = CA.w[1];
    pCR->w[0] = CA.w[0];
  }
}
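
/* Usage sketch (illustrative only, not part of the library): a
   standalone 128-by-128 division, e.g. 10^20 / 7:

     UINT128 x, y, q, r;
     x.w[1] = 0x5ull;  x.w[0] = 0x6bc75e2d63100000ull;   // 10^20
     y.w[1] = 0;       y.w[0] = 7;
     __div_128_by_128 (&q, &r, x, y);
     // q = floor(10^20 / 7), r = 10^20 mod 7
*/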


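// __div_256_by_128: divide the 256-bit value in *pCA4 by the 128-bit CY,
// accumulating the (at most 128-bit) quotient into *pCQ and leaving the
// remainder in the low words of *pCA4.  The quotient is estimated in two
// BINARY80 passes: a 128-bit estimate CQ2, then a 64-bit refinement Q3.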
__BID_INLINE__ void
__div_256_by_128 (UINT128 * pCQ, UINT256 * pCA4, UINT128 CY) {
  UINT256 CQ2Y;
  UINT128 CQ2, CQ3Y;
  UINT64 Q3, carry64;
  int_double d64;
  BINARY80 lx, ly, lq, l64, l128;

  // 2^64
  d64.i = 0x43f0000000000000ull;
  l64 = (BINARY80) d64.d;
  // 2^128
  l128 = l64 * l64;

  lx =
    ((BINARY80) (*pCA4).w[3] * l64 +
     (BINARY80) (*pCA4).w[2]) * l128 +
    (BINARY80) (*pCA4).w[1] * l64 + (BINARY80) (*pCA4).w[0];
  ly = (BINARY80) CY.w[1] * l128 + (BINARY80) CY.w[0] * l64;

  lq = lx / ly;
  CQ2.w[1] = (UINT64) lq;
  lq = (lq - CQ2.w[1]) * l64;
  CQ2.w[0] = (UINT64) lq;

  // CQ2*CY
  __mul_128x128_to_256 (CQ2Y, CY, CQ2);

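  // the BINARY80 estimate CQ2 can be off in either direction, so compare
  // CQ2*CY against the dividend to decide whether to correct up or down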
  // CQ2Y <= (*pCA4) ?
  if (CQ2Y.w[3] < (*pCA4).w[3]
      || (CQ2Y.w[3] == (*pCA4).w[3]
          && (CQ2Y.w[2] < (*pCA4).w[2]
              || (CQ2Y.w[2] == (*pCA4).w[2]
                  && (CQ2Y.w[1] < (*pCA4).w[1]
                      || (CQ2Y.w[1] == (*pCA4).w[1]
                          && (CQ2Y.w[0] <= (*pCA4).w[0]))))))) {

    // (*pCA4) - CQ2Y, guaranteed below 5*2^49*CY < 5*2^(49+128)
    __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CQ2Y.w[0]);
    __sub_borrow_in_out ((*pCA4).w[1], carry64, (*pCA4).w[1], CQ2Y.w[1],
                         carry64);
    (*pCA4).w[2] = (*pCA4).w[2] - CQ2Y.w[2] - carry64;

    lx = ((BINARY80) (*pCA4).w[2] * l128 +
          ((BINARY80) (*pCA4).w[1] * l64 +
           (BINARY80) (*pCA4).w[0])) * l64;
    lq = lx / ly;
    Q3 = (UINT64) lq;

    if (Q3) {
      Q3--;
      __mul_64x128_short (CQ3Y, Q3, CY);
      __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CQ3Y.w[0]);
      (*pCA4).w[1] = (*pCA4).w[1] - CQ3Y.w[1] - carry64;

      if ((*pCA4).w[1] > CY.w[1]
          || ((*pCA4).w[1] == CY.w[1] && (*pCA4).w[0] >= CY.w[0])) {
        Q3++;
        __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CY.w[0]);
        (*pCA4).w[1] = (*pCA4).w[1] - CY.w[1] - carry64;
        if ((*pCA4).w[1] > CY.w[1]
            || ((*pCA4).w[1] == CY.w[1] && (*pCA4).w[0] >= CY.w[0])) {
          Q3++;
          __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0],
                            CY.w[0]);
          (*pCA4).w[1] = (*pCA4).w[1] - CY.w[1] - carry64;
        }
      }
      // add Q3 to Q2
      __add_carry_out (CQ2.w[0], carry64, Q3, CQ2.w[0]);
      CQ2.w[1] += carry64;
    }
  } else {
    // CQ2Y - (*pCA4), guaranteed below 5*2^(49+128)
    __sub_borrow_out ((*pCA4).w[0], carry64, CQ2Y.w[0], (*pCA4).w[0]);
    __sub_borrow_in_out ((*pCA4).w[1], carry64, CQ2Y.w[1], (*pCA4).w[1],
                         carry64);
    (*pCA4).w[2] = CQ2Y.w[2] - (*pCA4).w[2] - carry64;

    lx =
      ((BINARY80) (*pCA4).w[2] * l128 +
       (BINARY80) (*pCA4).w[1] * l64 + (BINARY80) (*pCA4).w[0]) * l64;
    lq = lx / ly;
    Q3 = 1 + (UINT64) lq;

    __mul_64x128_short (CQ3Y, Q3, CY);
    __sub_borrow_out ((*pCA4).w[0], carry64, CQ3Y.w[0], (*pCA4).w[0]);
    (*pCA4).w[1] = CQ3Y.w[1] - (*pCA4).w[1] - carry64;

    if ((SINT64) (*pCA4).w[1] > (SINT64) CY.w[1]
        || ((*pCA4).w[1] == CY.w[1] && (*pCA4).w[0] >= CY.w[0])) {
      Q3--;
      __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CY.w[0]);
      (*pCA4).w[1] = (*pCA4).w[1] - CY.w[1] - carry64;
    } else if ((SINT64) (*pCA4).w[1] < 0) {
      Q3++;
      __add_carry_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CY.w[0]);
      (*pCA4).w[1] = (*pCA4).w[1] + CY.w[1] + carry64;
    }
    // subtract Q3 from Q2
    __sub_borrow_out (CQ2.w[0], carry64, CQ2.w[0], Q3);
    CQ2.w[1] -= carry64;
  }

  // (*pCQ) + CQ2 + carry
  __add_carry_out ((*pCQ).w[0], carry64, CQ2.w[0], (*pCQ).w[0]);
  (*pCQ).w[1] = (*pCQ).w[1] + CQ2.w[1] + carry64;
}
#else

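// Without 80-bit extended precision, double gives only ~53 significant
// bits per estimate, so the (up to 113-bit) quotients below are built in
// chunks: a 2^60-shifted piece, a 2^49-shifted piece, and a final piece,
// each followed by an exact integer correction.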
__BID_INLINE__ void
__div_128_by_128 (UINT128 * pCQ, UINT128 * pCR, UINT128 CX0, UINT128 CY) {
  UINT128 CY36, CY51, CQ, A2, CX, CQT;
  UINT64 Q;
  int_double t64, d49, d60;
  double lx, ly, lq;

  if (!CX0.w[1] && !CY.w[1]) {
    pCQ->w[0] = CX0.w[0] / CY.w[0];
    pCQ->w[1] = 0;
    pCR->w[1] = 0;
    pCR->w[0] = CX0.w[0] - pCQ->w[0] * CY.w[0];
    return;
  }

  CX.w[1] = CX0.w[1];
  CX.w[0] = CX0.w[0];

  // 2^64
  t64.i = 0x43f0000000000000ull;
  lx = (double) CX.w[1] * t64.d + (double) CX.w[0];
  ly = (double) CY.w[1] * t64.d + (double) CY.w[0];
  lq = lx / ly;

  CY36.w[1] = CY.w[0] >> (64 - 36);
  CY36.w[0] = CY.w[0] << 36;
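  // CY36 = CY << 36; the test below fires when CY < 2^28 and
  // CX.w[1] >= CY36.w[0], i.e. CX >= CY * 2^100, so the quotient
  // needs a chunk above bit 100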

  CQ.w[1] = CQ.w[0] = 0;

  // Q >= 2^100 ?
  if (!CY.w[1] && !CY36.w[1] && (CX.w[1] >= CY36.w[0])) {
    // then Q >= 2^100

    // 2^(-60)*CX/CY
    d60.i = 0x3c30000000000000ull;
    lq *= d60.d;
    Q = (UINT64) lq - 4ull;
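
    // the double-precision estimate may err in either direction; backing
    // Q off by 4 is intended to make it an underestimate, and the
    // shortfall is recovered by the later correction passes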

    // Q*CY
    __mul_64x64_to_128 (A2, Q, CY.w[0]);

    // A2 <<= 60
    A2.w[1] = (A2.w[1] << 60) | (A2.w[0] >> (64 - 60));
    A2.w[0] <<= 60;

    __sub_128_128 (CX, CX, A2);

    lx = (double) CX.w[1] * t64.d + (double) CX.w[0];
    lq = lx / ly;

    CQ.w[1] = Q >> (64 - 60);
    CQ.w[0] = Q << 60;
  }

  CY51.w[1] = (CY.w[1] << 51) | (CY.w[0] >> (64 - 51));
  CY51.w[0] = CY.w[0] << 51;

  if (CY.w[1] < (UINT64) (1 << (64 - 51))
      && (__unsigned_compare_gt_128 (CX, CY51))) {
    // Q > 2^51

    // 2^(-49)*CX/CY
    d49.i = 0x3ce0000000000000ull;
    lq *= d49.d;

    Q = (UINT64) lq - 1ull;

    // Q*CY
    __mul_64x64_to_128 (A2, Q, CY.w[0]);
    A2.w[1] += Q * CY.w[1];

    // A2 <<= 49
    A2.w[1] = (A2.w[1] << 49) | (A2.w[0] >> (64 - 49));
    A2.w[0] <<= 49;

    __sub_128_128 (CX, CX, A2);

    CQT.w[1] = Q >> (64 - 49);
    CQT.w[0] = Q << 49;
    __add_128_128 (CQ, CQ, CQT);

    lx = (double) CX.w[1] * t64.d + (double) CX.w[0];
    lq = lx / ly;
  }

  Q = (UINT64) lq;

  __mul_64x64_to_128 (A2, Q, CY.w[0]);
  A2.w[1] += Q * CY.w[1];

  __sub_128_128 (CX, CX, A2);
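  // the last estimate can be off by roughly +/-2: a negative remainder
  // means Q was too big (add CY back, up to twice); a remainder >= CY
  // means it was too small by one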
  if ((SINT64) CX.w[1] < 0) {
    Q--;
    CX.w[0] += CY.w[0];
    if (CX.w[0] < CY.w[0])
      CX.w[1]++;
    CX.w[1] += CY.w[1];
    if ((SINT64) CX.w[1] < 0) {
      Q--;
      CX.w[0] += CY.w[0];
      if (CX.w[0] < CY.w[0])
        CX.w[1]++;
      CX.w[1] += CY.w[1];
    }
  } else if (__unsigned_compare_ge_128 (CX, CY)) {
    Q++;
    __sub_128_128 (CX, CX, CY);
  }

  __add_128_64 (CQ, CQ, Q);

  pCQ->w[1] = CQ.w[1];
  pCQ->w[0] = CQ.w[0];
  pCR->w[1] = CX.w[1];
  pCR->w[0] = CX.w[0];
  return;
}


__BID_INLINE__ void
__div_256_by_128 (UINT128 * pCQ, UINT256 * pCA4, UINT128 CY) {
  UINT256 CA4, CA2, CY51, CY36;
  UINT128 CQ, A2, A2h, CQT;
  UINT64 Q, carry64;
  int_double t64, d49, d60;
  double lx, ly, lq, d128, d192;

  // the quotient is assumed to be at most 113 bits,
  // as needed by the BID128 divide routines

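  // same chunked scheme as the 128-by-128 case above: estimate a piece
  // of the quotient in double precision, subtract Q*CY (suitably
  // shifted) exactly, and repeat, accumulating into CQ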
  // initial dividend
  CA4.w[3] = (*pCA4).w[3];
  CA4.w[2] = (*pCA4).w[2];
  CA4.w[1] = (*pCA4).w[1];
  CA4.w[0] = (*pCA4).w[0];
  CQ.w[1] = (*pCQ).w[1];
  CQ.w[0] = (*pCQ).w[0];

  // 2^64
  t64.i = 0x43f0000000000000ull;
  d128 = t64.d * t64.d;
  d192 = d128 * t64.d;
  lx = (double) CA4.w[3] * d192 + ((double) CA4.w[2] * d128 +
                                   ((double) CA4.w[1] * t64.d +
                                    (double) CA4.w[0]));
  ly = (double) CY.w[1] * t64.d + (double) CY.w[0];
  lq = lx / ly;

  CY36.w[2] = CY.w[1] >> (64 - 36);
  CY36.w[1] = (CY.w[1] << 36) | (CY.w[0] >> (64 - 36));
  CY36.w[0] = CY.w[0] << 36;

  // Q >= 2^100 ?
  if (CA4.w[3] > CY36.w[2]
      || (CA4.w[3] == CY36.w[2]
          && (CA4.w[2] > CY36.w[1]
              || (CA4.w[2] == CY36.w[1] && CA4.w[1] >= CY36.w[0])))) {
    // 2^(-60)*CA4/CY
    d60.i = 0x3c30000000000000ull;
    lq *= d60.d;
    Q = (UINT64) lq - 4ull;

    // Q*CY
    __mul_64x128_to_192 (CA2, Q, CY);

    // CA2 <<= 60
    // CA2.w[3] = CA2.w[2] >> (64-60);
    CA2.w[2] = (CA2.w[2] << 60) | (CA2.w[1] >> (64 - 60));
    CA2.w[1] = (CA2.w[1] << 60) | (CA2.w[0] >> (64 - 60));
    CA2.w[0] <<= 60;

    // CA4 -= CA2
    __sub_borrow_out (CA4.w[0], carry64, CA4.w[0], CA2.w[0]);
    __sub_borrow_in_out (CA4.w[1], carry64, CA4.w[1], CA2.w[1],
                         carry64);
    CA4.w[2] = CA4.w[2] - CA2.w[2] - carry64;

    lx = ((double) CA4.w[2] * d128 +
          ((double) CA4.w[1] * t64.d + (double) CA4.w[0]));
    lq = lx / ly;

    CQT.w[1] = Q >> (64 - 60);
    CQT.w[0] = Q << 60;
    __add_128_128 (CQ, CQ, CQT);
  }

  CY51.w[2] = CY.w[1] >> (64 - 51);
  CY51.w[1] = (CY.w[1] << 51) | (CY.w[0] >> (64 - 51));
  CY51.w[0] = CY.w[0] << 51;

  if (CA4.w[2] > CY51.w[2] || ((CA4.w[2] == CY51.w[2])
                               && (__unsigned_compare_gt_128 (CA4, CY51)))) {
    // Q > 2^51

    // 2^(-49)*CA4/CY
    d49.i = 0x3ce0000000000000ull;
    lq *= d49.d;

    Q = (UINT64) lq - 1ull;

    // Q*CY
    __mul_64x64_to_128 (A2, Q, CY.w[0]);
    __mul_64x64_to_128 (A2h, Q, CY.w[1]);
    A2.w[1] += A2h.w[0];
    if (A2.w[1] < A2h.w[0])
      A2h.w[1]++;

    // A2 <<= 49
    CA2.w[2] = (A2h.w[1] << 49) | (A2.w[1] >> (64 - 49));
    CA2.w[1] = (A2.w[1] << 49) | (A2.w[0] >> (64 - 49));
    CA2.w[0] = A2.w[0] << 49;

    __sub_borrow_out (CA4.w[0], carry64, CA4.w[0], CA2.w[0]);
    __sub_borrow_in_out (CA4.w[1], carry64, CA4.w[1], CA2.w[1],
                         carry64);
    CA4.w[2] = CA4.w[2] - CA2.w[2] - carry64;

    CQT.w[1] = Q >> (64 - 49);
    CQT.w[0] = Q << 49;
    __add_128_128 (CQ, CQ, CQT);

    lx = ((double) CA4.w[2] * d128 +
          ((double) CA4.w[1] * t64.d + (double) CA4.w[0]));
    lq = lx / ly;
  }

  Q = (UINT64) lq;
  __mul_64x64_to_128 (A2, Q, CY.w[0]);
  A2.w[1] += Q * CY.w[1];

  __sub_128_128 (CA4, CA4, A2);
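  // final fixup, as in __div_128_by_128: a negative remainder means Q
  // was too large (restore by adding CY back, at most twice); a
  // remainder >= CY means Q was one too small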
  if ((SINT64) CA4.w[1] < 0) {
    Q--;
    CA4.w[0] += CY.w[0];
    if (CA4.w[0] < CY.w[0])
      CA4.w[1]++;
    CA4.w[1] += CY.w[1];
    if ((SINT64) CA4.w[1] < 0) {
      Q--;
      CA4.w[0] += CY.w[0];
      if (CA4.w[0] < CY.w[0])
        CA4.w[1]++;
      CA4.w[1] += CY.w[1];
    }
  } else if (__unsigned_compare_ge_128 (CA4, CY)) {
    Q++;
    __sub_128_128 (CA4, CA4, CY);
  }

  __add_128_64 (CQ, CQ, Q);

  pCQ->w[1] = CQ.w[1];
  pCQ->w[0] = CQ.w[0];
  pCA4->w[1] = CA4.w[1];
  pCA4->w[0] = CA4.w[0];
  return;
}
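
/* Usage sketch (illustrative only): __div_256_by_128 adds the quotient
   to whatever is already in *pCQ, so clear it for a standalone divide:

     UINT256 a4;   // dividend, filled so that floor(a4 / y) < 2^113
     UINT128 y;    // divisor
     UINT128 q;
     q.w[1] = q.w[0] = 0;        // no partial quotient yet
     __div_256_by_128 (&q, &a4, y);
     // q now holds floor(a4 / y); a4.w[1], a4.w[0] hold a4 mod y
*/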

#endif
#endif