1*55112Storek /* 2*55112Storek * Copyright (c) 1992 The Regents of the University of California. 3*55112Storek * All rights reserved. 4*55112Storek * 5*55112Storek * This software was developed by the Computer Systems Engineering group 6*55112Storek * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and 7*55112Storek * contributed to Berkeley. 8*55112Storek * 9*55112Storek * %sccs.include.redist.c% 10*55112Storek * 11*55112Storek * @(#)fpu_div.c 7.1 (Berkeley) 07/13/92 12*55112Storek * 13*55112Storek * from: $Header: fpu_div.c,v 1.2 92/06/17 05:41:30 torek Exp $ 14*55112Storek */ 15*55112Storek 16*55112Storek /* 17*55112Storek * Perform an FPU divide (return x / y). 18*55112Storek */ 19*55112Storek 20*55112Storek #include "sys/types.h" 21*55112Storek 22*55112Storek #include "machine/reg.h" 23*55112Storek 24*55112Storek #include "fpu_arith.h" 25*55112Storek #include "fpu_emu.h" 26*55112Storek 27*55112Storek /* 28*55112Storek * Division of normal numbers is done as follows: 29*55112Storek * 30*55112Storek * x and y are floating point numbers, i.e., in the form 1.bbbb * 2^e. 31*55112Storek * If X and Y are the mantissas (1.bbbb's), the quotient is then: 32*55112Storek * 33*55112Storek * q = (X / Y) * 2^((x exponent) - (y exponent)) 34*55112Storek * 35*55112Storek * Since X and Y are both in [1.0,2.0), the quotient's mantissa (X / Y) 36*55112Storek * will be in [0.5,2.0). Moreover, it will be less than 1.0 if and only 37*55112Storek * if X < Y. In that case, it will have to be shifted left one bit to 38*55112Storek * become a normal number, and the exponent decremented. Thus, the 39*55112Storek * desired exponent is: 40*55112Storek * 41*55112Storek * left_shift = x->fp_mant < y->fp_mant; 42*55112Storek * result_exp = x->fp_exp - y->fp_exp - left_shift; 43*55112Storek * 44*55112Storek * The quotient mantissa X/Y can then be computed one bit at a time 45*55112Storek * using the following algorithm: 46*55112Storek * 47*55112Storek * Q = 0; -- Initial quotient. 48*55112Storek * R = X; -- Initial remainder, 49*55112Storek * if (left_shift) -- but fixed up in advance. 50*55112Storek * R *= 2; 51*55112Storek * for (bit = FP_NMANT; --bit >= 0; R *= 2) { 52*55112Storek * if (R >= Y) { 53*55112Storek * Q |= 1 << bit; 54*55112Storek * R -= Y; 55*55112Storek * } 56*55112Storek * } 57*55112Storek * 58*55112Storek * The subtraction R -= Y always removes the uppermost bit from R (and 59*55112Storek * can sometimes remove additional lower-order 1 bits); this proof is 60*55112Storek * left to the reader. 61*55112Storek * 62*55112Storek * This loop correctly calculates the guard and round bits since they are 63*55112Storek * included in the expanded internal representation. The sticky bit 64*55112Storek * is to be set if and only if any other bits beyond guard and round 65*55112Storek * would be set. From the above it is obvious that this is true if and 66*55112Storek * only if the remainder R is nonzero when the loop terminates. 67*55112Storek * 68*55112Storek * Examining the loop above, we can see that the quotient Q is built 69*55112Storek * one bit at a time ``from the top down''. This means that we can 70*55112Storek * dispense with the multi-word arithmetic and just build it one word 71*55112Storek * at a time, writing each result word when it is done. 72*55112Storek * 73*55112Storek * Furthermore, since X and Y are both in [1.0,2.0), we know that, 74*55112Storek * initially, R >= Y. (Recall that, if X < Y, R is set to X * 2 and 75*55112Storek * is therefore at in [2.0,4.0).) Thus Q is sure to have bit FP_NMANT-1 76*55112Storek * set, and R can be set initially to either X - Y (when X >= Y) or 77*55112Storek * 2X - Y (when X < Y). In addition, comparing R and Y is difficult, 78*55112Storek * so we will simply calculate R - Y and see if that underflows. 79*55112Storek * This leads to the following revised version of the algorithm: 80*55112Storek * 81*55112Storek * R = X; 82*55112Storek * bit = FP_1; 83*55112Storek * D = R - Y; 84*55112Storek * if (D >= 0) { 85*55112Storek * result_exp = x->fp_exp - y->fp_exp; 86*55112Storek * R = D; 87*55112Storek * q = bit; 88*55112Storek * bit >>= 1; 89*55112Storek * } else { 90*55112Storek * result_exp = x->fp_exp - y->fp_exp - 1; 91*55112Storek * q = 0; 92*55112Storek * } 93*55112Storek * R <<= 1; 94*55112Storek * do { 95*55112Storek * D = R - Y; 96*55112Storek * if (D >= 0) { 97*55112Storek * q |= bit; 98*55112Storek * R = D; 99*55112Storek * } 100*55112Storek * R <<= 1; 101*55112Storek * } while ((bit >>= 1) != 0); 102*55112Storek * Q[0] = q; 103*55112Storek * for (i = 1; i < 4; i++) { 104*55112Storek * q = 0, bit = 1 << 31; 105*55112Storek * do { 106*55112Storek * D = R - Y; 107*55112Storek * if (D >= 0) { 108*55112Storek * q |= bit; 109*55112Storek * R = D; 110*55112Storek * } 111*55112Storek * R <<= 1; 112*55112Storek * } while ((bit >>= 1) != 0); 113*55112Storek * Q[i] = q; 114*55112Storek * } 115*55112Storek * 116*55112Storek * This can be refined just a bit further by moving the `R <<= 1' 117*55112Storek * calculations to the front of the do-loops and eliding the first one. 118*55112Storek * The process can be terminated immediately whenever R becomes 0, but 119*55112Storek * this is relatively rare, and we do not bother. 120*55112Storek */ 121*55112Storek 122*55112Storek struct fpn * 123*55112Storek fpu_div(fe) 124*55112Storek register struct fpemu *fe; 125*55112Storek { 126*55112Storek register struct fpn *x = &fe->fe_f1, *y = &fe->fe_f2; 127*55112Storek register u_int q, bit; 128*55112Storek register u_int r0, r1, r2, r3, d0, d1, d2, d3, y0, y1, y2, y3; 129*55112Storek FPU_DECL_CARRY 130*55112Storek 131*55112Storek /* 132*55112Storek * Since divide is not commutative, we cannot just use ORDER. 133*55112Storek * Check either operand for NaN first; if there is at least one, 134*55112Storek * order the signalling one (if only one) onto the right, then 135*55112Storek * return it. Otherwise we have the following cases: 136*55112Storek * 137*55112Storek * Inf / Inf = NaN, plus NV exception 138*55112Storek * Inf / num = Inf [i.e., return x] 139*55112Storek * Inf / 0 = Inf [i.e., return x] 140*55112Storek * 0 / Inf = 0 [i.e., return x] 141*55112Storek * 0 / num = 0 [i.e., return x] 142*55112Storek * 0 / 0 = NaN, plus NV exception 143*55112Storek * num / Inf = 0 144*55112Storek * num / num = num (do the divide) 145*55112Storek * num / 0 = Inf, plus DZ exception 146*55112Storek */ 147*55112Storek if (ISNAN(x) || ISNAN(y)) { 148*55112Storek ORDER(x, y); 149*55112Storek return (y); 150*55112Storek } 151*55112Storek if (ISINF(x) || ISZERO(x)) { 152*55112Storek if (x->fp_class == y->fp_class) 153*55112Storek return (fpu_newnan(fe)); 154*55112Storek return (x); 155*55112Storek } 156*55112Storek 157*55112Storek /* all results at this point use XOR of operand signs */ 158*55112Storek x->fp_sign ^= y->fp_sign; 159*55112Storek if (ISINF(y)) { 160*55112Storek x->fp_class = FPC_ZERO; 161*55112Storek return (x); 162*55112Storek } 163*55112Storek if (ISZERO(y)) { 164*55112Storek fe->fe_cx = FSR_DZ; 165*55112Storek x->fp_class = FPC_INF; 166*55112Storek return (x); 167*55112Storek } 168*55112Storek 169*55112Storek /* 170*55112Storek * Macros for the divide. See comments at top for algorithm. 171*55112Storek * Note that we expand R, D, and Y here. 172*55112Storek */ 173*55112Storek 174*55112Storek #define SUBTRACT /* D = R - Y */ \ 175*55112Storek FPU_SUBS(d3, r3, y3); FPU_SUBCS(d2, r2, y2); \ 176*55112Storek FPU_SUBCS(d1, r1, y1); FPU_SUBC(d0, r0, y0) 177*55112Storek 178*55112Storek #define NONNEGATIVE /* D >= 0 */ \ 179*55112Storek ((int)d0 >= 0) 180*55112Storek 181*55112Storek #ifdef FPU_SHL1_BY_ADD 182*55112Storek #define SHL1 /* R <<= 1 */ \ 183*55112Storek FPU_ADDS(r3, r3, r3); FPU_ADDCS(r2, r2, r2); \ 184*55112Storek FPU_ADDCS(r1, r1, r1); FPU_ADDC(r0, r0, r0) 185*55112Storek #else 186*55112Storek #define SHL1 \ 187*55112Storek r0 = (r0 << 1) | (r1 >> 31), r1 = (r1 << 1) | (r2 >> 31), \ 188*55112Storek r2 = (r2 << 1) | (r3 >> 31), r3 <<= 1 189*55112Storek #endif 190*55112Storek 191*55112Storek #define LOOP /* do ... while (bit >>= 1) */ \ 192*55112Storek do { \ 193*55112Storek SHL1; \ 194*55112Storek SUBTRACT; \ 195*55112Storek if (NONNEGATIVE) { \ 196*55112Storek q |= bit; \ 197*55112Storek r0 = d0, r1 = d1, r2 = d2, r3 = d3; \ 198*55112Storek } \ 199*55112Storek } while ((bit >>= 1) != 0) 200*55112Storek 201*55112Storek #define WORD(r, i) /* calculate r->fp_mant[i] */ \ 202*55112Storek q = 0; \ 203*55112Storek bit = 1 << 31; \ 204*55112Storek LOOP; \ 205*55112Storek (x)->fp_mant[i] = q 206*55112Storek 207*55112Storek /* Setup. Note that we put our result in x. */ 208*55112Storek r0 = x->fp_mant[0]; 209*55112Storek r1 = x->fp_mant[1]; 210*55112Storek r2 = x->fp_mant[2]; 211*55112Storek r3 = x->fp_mant[3]; 212*55112Storek y0 = y->fp_mant[0]; 213*55112Storek y1 = y->fp_mant[1]; 214*55112Storek y2 = y->fp_mant[2]; 215*55112Storek y3 = y->fp_mant[3]; 216*55112Storek 217*55112Storek bit = FP_1; 218*55112Storek SUBTRACT; 219*55112Storek if (NONNEGATIVE) { 220*55112Storek x->fp_exp -= y->fp_exp; 221*55112Storek r0 = d0, r1 = d1, r2 = d2, r3 = d3; 222*55112Storek q = bit; 223*55112Storek bit >>= 1; 224*55112Storek } else { 225*55112Storek x->fp_exp -= y->fp_exp + 1; 226*55112Storek q = 0; 227*55112Storek } 228*55112Storek LOOP; 229*55112Storek x->fp_mant[0] = q; 230*55112Storek WORD(x, 1); 231*55112Storek WORD(x, 2); 232*55112Storek WORD(x, 3); 233*55112Storek x->fp_sticky = r0 | r1 | r2 | r3; 234*55112Storek 235*55112Storek return (x); 236*55112Storek } 237