155114Storek /* 255114Storek * Copyright (c) 1992 The Regents of the University of California. 355114Storek * All rights reserved. 455114Storek * 555114Storek * This software was developed by the Computer Systems Engineering group 655114Storek * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and 755114Storek * contributed to Berkeley. 855114Storek * 9*55500Sbostic * All advertising materials mentioning features or use of this software 10*55500Sbostic * must display the following acknowledgement: 11*55500Sbostic * This product includes software developed by the University of 12*55500Sbostic * California, Lawrence Berkeley Laboratories. 13*55500Sbostic * 1455114Storek * %sccs.include.redist.c% 1555114Storek * 16*55500Sbostic * @(#)fpu_mul.c 7.2 (Berkeley) 07/21/92 1755114Storek * 1855114Storek * from: $Header: fpu_mul.c,v 1.2 92/06/17 05:41:34 torek Exp $ 1955114Storek */ 2055114Storek 2155114Storek /* 2255114Storek * Perform an FPU multiply (return x * y). 2355114Storek */ 2455114Storek 2555114Storek #include "sys/types.h" 2655114Storek 2755114Storek #include "machine/reg.h" 2855114Storek 2955114Storek #include "fpu_arith.h" 3055114Storek #include "fpu_emu.h" 3155114Storek 3255114Storek /* 3355114Storek * The multiplication algorithm for normal numbers is as follows: 3455114Storek * 3555114Storek * The fraction of the product is built in the usual stepwise fashion. 3655114Storek * Each step consists of shifting the accumulator right one bit 3755114Storek * (maintaining any guard bits) and, if the next bit in y is set, 3855114Storek * adding the multiplicand (x) to the accumulator. Then, in any case, 3955114Storek * we advance one bit leftward in y. Algorithmically: 4055114Storek * 4155114Storek * A = 0; 4255114Storek * for (bit = 0; bit < FP_NMANT; bit++) { 4355114Storek * sticky |= A & 1, A >>= 1; 4455114Storek * if (Y & (1 << bit)) 4555114Storek * A += X; 4655114Storek * } 4755114Storek * 4855114Storek * (X and Y here represent the mantissas of x and y respectively.) 4955114Storek * The resultant accumulator (A) is the product's mantissa. It may 5055114Storek * be as large as 11.11111... in binary and hence may need to be 5155114Storek * shifted right, but at most one bit. 5255114Storek * 5355114Storek * Since we do not have efficient multiword arithmetic, we code the 5455114Storek * accumulator as four separate words, just like any other mantissa. 5555114Storek * We use local `register' variables in the hope that this is faster 5655114Storek * than memory. We keep x->fp_mant in locals for the same reason. 5755114Storek * 5855114Storek * In the algorithm above, the bits in y are inspected one at a time. 5955114Storek * We will pick them up 32 at a time and then deal with those 32, one 6055114Storek * at a time. Note, however, that we know several things about y: 6155114Storek * 6255114Storek * - the guard and round bits at the bottom are sure to be zero; 6355114Storek * 6455114Storek * - often many low bits are zero (y is often from a single or double 6555114Storek * precision source); 6655114Storek * 6755114Storek * - bit FP_NMANT-1 is set, and FP_1*2 fits in a word. 6855114Storek * 6955114Storek * We can also test for 32-zero-bits swiftly. In this case, the center 7055114Storek * part of the loop---setting sticky, shifting A, and not adding---will 7155114Storek * run 32 times without adding X to A. We can do a 32-bit shift faster 7255114Storek * by simply moving words. Since zeros are common, we optimize this case. 7355114Storek * Furthermore, since A is initially zero, we can omit the shift as well 7455114Storek * until we reach a nonzero word. 7555114Storek */ 7655114Storek struct fpn * 7755114Storek fpu_mul(fe) 7855114Storek register struct fpemu *fe; 7955114Storek { 8055114Storek register struct fpn *x = &fe->fe_f1, *y = &fe->fe_f2; 8155114Storek register u_int a3, a2, a1, a0, x3, x2, x1, x0, bit, m; 8255114Storek register int sticky; 8355114Storek FPU_DECL_CARRY 8455114Storek 8555114Storek /* 8655114Storek * Put the `heavier' operand on the right (see fpu_emu.h). 8755114Storek * Then we will have one of the following cases, taken in the 8855114Storek * following order: 8955114Storek * 9055114Storek * - y = NaN. Implied: if only one is a signalling NaN, y is. 9155114Storek * The result is y. 9255114Storek * - y = Inf. Implied: x != NaN (is 0, number, or Inf: the NaN 9355114Storek * case was taken care of earlier). 9455114Storek * If x = 0, the result is NaN. Otherwise the result 9555114Storek * is y, with its sign reversed if x is negative. 9655114Storek * - x = 0. Implied: y is 0 or number. 9755114Storek * The result is 0 (with XORed sign as usual). 9855114Storek * - other. Implied: both x and y are numbers. 9955114Storek * The result is x * y (XOR sign, multiply bits, add exponents). 10055114Storek */ 10155114Storek ORDER(x, y); 10255114Storek if (ISNAN(y)) { 10355114Storek y->fp_sign ^= x->fp_sign; 10455114Storek return (y); 10555114Storek } 10655114Storek if (ISINF(y)) { 10755114Storek if (ISZERO(x)) 10855114Storek return (fpu_newnan(fe)); 10955114Storek y->fp_sign ^= x->fp_sign; 11055114Storek return (y); 11155114Storek } 11255114Storek if (ISZERO(x)) { 11355114Storek x->fp_sign ^= y->fp_sign; 11455114Storek return (x); 11555114Storek } 11655114Storek 11755114Storek /* 11855114Storek * Setup. In the code below, the mask `m' will hold the current 11955114Storek * mantissa byte from y. The variable `bit' denotes the bit 12055114Storek * within m. We also define some macros to deal with everything. 12155114Storek */ 12255114Storek x3 = x->fp_mant[3]; 12355114Storek x2 = x->fp_mant[2]; 12455114Storek x1 = x->fp_mant[1]; 12555114Storek x0 = x->fp_mant[0]; 12655114Storek sticky = a3 = a2 = a1 = a0 = 0; 12755114Storek 12855114Storek #define ADD /* A += X */ \ 12955114Storek FPU_ADDS(a3, a3, x3); \ 13055114Storek FPU_ADDCS(a2, a2, x2); \ 13155114Storek FPU_ADDCS(a1, a1, x1); \ 13255114Storek FPU_ADDC(a0, a0, x0) 13355114Storek 13455114Storek #define SHR1 /* A >>= 1, with sticky */ \ 13555114Storek sticky |= a3 & 1, a3 = (a3 >> 1) | (a2 << 31), \ 13655114Storek a2 = (a2 >> 1) | (a1 << 31), a1 = (a1 >> 1) | (a0 << 31), a0 >>= 1 13755114Storek 13855114Storek #define SHR32 /* A >>= 32, with sticky */ \ 13955114Storek sticky |= a3, a3 = a2, a2 = a1, a1 = a0, a0 = 0 14055114Storek 14155114Storek #define STEP /* each 1-bit step of the multiplication */ \ 14255114Storek SHR1; if (bit & m) { ADD; }; bit <<= 1 14355114Storek 14455114Storek /* 14555114Storek * We are ready to begin. The multiply loop runs once for each 14655114Storek * of the four 32-bit words. Some words, however, are special. 14755114Storek * As noted above, the low order bits of Y are often zero. Even 14855114Storek * if not, the first loop can certainly skip the guard bits. 14955114Storek * The last word of y has its highest 1-bit in position FP_NMANT-1, 15055114Storek * so we stop the loop when we move past that bit. 15155114Storek */ 15255114Storek if ((m = y->fp_mant[3]) == 0) { 15355114Storek /* SHR32; */ /* unneeded since A==0 */ 15455114Storek } else { 15555114Storek bit = 1 << FP_NG; 15655114Storek do { 15755114Storek STEP; 15855114Storek } while (bit != 0); 15955114Storek } 16055114Storek if ((m = y->fp_mant[2]) == 0) { 16155114Storek SHR32; 16255114Storek } else { 16355114Storek bit = 1; 16455114Storek do { 16555114Storek STEP; 16655114Storek } while (bit != 0); 16755114Storek } 16855114Storek if ((m = y->fp_mant[1]) == 0) { 16955114Storek SHR32; 17055114Storek } else { 17155114Storek bit = 1; 17255114Storek do { 17355114Storek STEP; 17455114Storek } while (bit != 0); 17555114Storek } 17655114Storek m = y->fp_mant[0]; /* definitely != 0 */ 17755114Storek bit = 1; 17855114Storek do { 17955114Storek STEP; 18055114Storek } while (bit <= m); 18155114Storek 18255114Storek /* 18355114Storek * Done with mantissa calculation. Get exponent and handle 18455114Storek * 11.111...1 case, then put result in place. We reuse x since 18555114Storek * it already has the right class (FP_NUM). 18655114Storek */ 18755114Storek m = x->fp_exp + y->fp_exp; 18855114Storek if (a0 >= FP_2) { 18955114Storek SHR1; 19055114Storek m++; 19155114Storek } 19255114Storek x->fp_sign ^= y->fp_sign; 19355114Storek x->fp_exp = m; 19455114Storek x->fp_sticky = sticky; 19555114Storek x->fp_mant[3] = a3; 19655114Storek x->fp_mant[2] = a2; 19755114Storek x->fp_mant[1] = a1; 19855114Storek x->fp_mant[0] = a0; 19955114Storek return (x); 20055114Storek } 201