sparc64/fpu/fpu_mul.c

*6244ddccSmiod/*	$OpenBSD: fpu_mul.c,v 1.4 2024/03/29 21:02:11 miod Exp $	*/
02b90beaSjason
02b90beaSjason/*
02b90beaSjason * Copyright (c) 1992, 1993
02b90beaSjason *	The Regents of the University of California.  All rights reserved.
02b90beaSjason *
02b90beaSjason * This software was developed by the Computer Systems Engineering group
02b90beaSjason * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
02b90beaSjason * contributed to Berkeley.
02b90beaSjason *
02b90beaSjason * All advertising materials mentioning features or use of this software
02b90beaSjason * must display the following acknowledgement:
02b90beaSjason *	This product includes software developed by the University of
02b90beaSjason *	California, Lawrence Berkeley Laboratory.
02b90beaSjason *
02b90beaSjason * Redistribution and use in source and binary forms, with or without
02b90beaSjason * modification, are permitted provided that the following conditions
02b90beaSjason * are met:
02b90beaSjason * 1. Redistributions of source code must retain the above copyright
02b90beaSjason *    notice, this list of conditions and the following disclaimer.
02b90beaSjason * 2. Redistributions in binary form must reproduce the above copyright
02b90beaSjason *    notice, this list of conditions and the following disclaimer in the
02b90beaSjason *    documentation and/or other materials provided with the distribution.
02b90beaSjason * 3. All advertising materials mentioning features or use of this software
02b90beaSjason *    must display the following acknowledgement:
02b90beaSjason *	This product includes software developed by the University of
02b90beaSjason *	California, Berkeley and its contributors.
02b90beaSjason * 4. Neither the name of the University nor the names of its contributors
02b90beaSjason *    may be used to endorse or promote products derived from this software
02b90beaSjason *    without specific prior written permission.
02b90beaSjason *
02b90beaSjason * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
02b90beaSjason * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
02b90beaSjason * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
02b90beaSjason * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
02b90beaSjason * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
02b90beaSjason * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
02b90beaSjason * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
02b90beaSjason * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
02b90beaSjason * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
02b90beaSjason * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
02b90beaSjason * SUCH DAMAGE.
02b90beaSjason *
02b90beaSjason *	@(#)fpu_mul.c	8.1 (Berkeley) 6/11/93
02b90beaSjason *	$NetBSD: fpu_mul.c,v 1.2 1994/11/20 20:52:44 deraadt Exp $
02b90beaSjason */
02b90beaSjason
02b90beaSjason/*
02b90beaSjason * Perform an FPU multiply (return x * y).
02b90beaSjason */
02b90beaSjason
02b90beaSjason#include <sys/types.h>
02b90beaSjason
02b90beaSjason#include "fpu_arith.h"
02b90beaSjason#include "fpu_emu.h"
02b90beaSjason#include "fpu_extern.h"
02b90beaSjason
02b90beaSjason/*
02b90beaSjason * The multiplication algorithm for normal numbers is as follows:
02b90beaSjason *
02b90beaSjason * The fraction of the product is built in the usual stepwise fashion.
02b90beaSjason * Each step consists of shifting the accumulator right one bit
02b90beaSjason * (maintaining any guard bits) and, if the next bit in y is set,
02b90beaSjason * adding the multiplicand (x) to the accumulator.  Then, in any case,
02b90beaSjason * we advance one bit leftward in y.  Algorithmically:
02b90beaSjason *
02b90beaSjason *	A = 0;
02b90beaSjason *	for (bit = 0; bit < FP_NMANT; bit++) {
02b90beaSjason *		sticky |= A & 1, A >>= 1;
02b90beaSjason *		if (Y & (1 << bit))
02b90beaSjason *			A += X;
02b90beaSjason *	}
02b90beaSjason *
02b90beaSjason * (X and Y here represent the mantissas of x and y respectively.)
02b90beaSjason * The resultant accumulator (A) is the product's mantissa.  It may
02b90beaSjason * be as large as 11.11111... in binary and hence may need to be
02b90beaSjason * shifted right, but at most one bit.
02b90beaSjason *
02b90beaSjason * Since we do not have efficient multiword arithmetic, we code the
02b90beaSjason * accumulator as four separate words, just like any other mantissa.
02b90beaSjason * We use local `register' variables in the hope that this is faster
02b90beaSjason * than memory.  We keep x->fp_mant in locals for the same reason.
02b90beaSjason *
02b90beaSjason * In the algorithm above, the bits in y are inspected one at a time.
02b90beaSjason * We will pick them up 32 at a time and then deal with those 32, one
02b90beaSjason * at a time.  Note, however, that we know several things about y:
02b90beaSjason *
02b90beaSjason *    - the guard and round bits at the bottom are sure to be zero;
02b90beaSjason *
02b90beaSjason *    - often many low bits are zero (y is often from a single or double
02b90beaSjason *	precision source);
02b90beaSjason *
02b90beaSjason *    - bit FP_NMANT-1 is set, and FP_1*2 fits in a word.
02b90beaSjason *
02b90beaSjason * We can also test for 32-zero-bits swiftly.  In this case, the center
02b90beaSjason * part of the loop---setting sticky, shifting A, and not adding---will
02b90beaSjason * run 32 times without adding X to A.  We can do a 32-bit shift faster
02b90beaSjason * by simply moving words.  Since zeros are common, we optimize this case.
02b90beaSjason * Furthermore, since A is initially zero, we can omit the shift as well
02b90beaSjason * until we reach a nonzero word.
02b90beaSjason */
02b90beaSjasonstruct fpn *
02b90beaSjason__fpu_mul(fe)
02b90beaSjason	struct fpemu *fe;
02b90beaSjason{
02b90beaSjason	struct fpn *x = &fe->fe_f1, *y = &fe->fe_f2;
02b90beaSjason	u_int a3, a2, a1, a0, x3, x2, x1, x0, bit, m;
02b90beaSjason	int sticky;
02b90beaSjason	FPU_DECL_CARRY
02b90beaSjason
02b90beaSjason	/*
02b90beaSjason	 * Put the `heavier' operand on the right (see fpu_emu.h).
02b90beaSjason	 * Then we will have one of the following cases, taken in the
02b90beaSjason	 * following order:
02b90beaSjason	 *
02b90beaSjason	 *  - y = NaN.  Implied: if only one is a signalling NaN, y is.
02b90beaSjason	 *	The result is y.
02b90beaSjason	 *  - y = Inf.  Implied: x != NaN (is 0, number, or Inf: the NaN
02b90beaSjason	 *    case was taken care of earlier).
02b90beaSjason	 *	If x = 0, the result is NaN.  Otherwise the result
02b90beaSjason	 *	is y, with its sign reversed if x is negative.
02b90beaSjason	 *  - x = 0.  Implied: y is 0 or number.
02b90beaSjason	 *	The result is 0 (with XORed sign as usual).
02b90beaSjason	 *  - other.  Implied: both x and y are numbers.
02b90beaSjason	 *	The result is x * y (XOR sign, multiply bits, add exponents).
02b90beaSjason	 */
02b90beaSjason	ORDER(x, y);
02b90beaSjason	if (ISNAN(y)) {
02b90beaSjason		y->fp_sign ^= x->fp_sign;
02b90beaSjason		return (y);
02b90beaSjason	}
02b90beaSjason	if (ISINF(y)) {
02b90beaSjason		if (ISZERO(x))
02b90beaSjason			return (__fpu_newnan(fe));
02b90beaSjason		y->fp_sign ^= x->fp_sign;
02b90beaSjason		return (y);
02b90beaSjason	}
02b90beaSjason	if (ISZERO(x)) {
02b90beaSjason		x->fp_sign ^= y->fp_sign;
02b90beaSjason		return (x);
02b90beaSjason	}
02b90beaSjason
02b90beaSjason	/*
02b90beaSjason	 * Setup.  In the code below, the mask `m' will hold the current
02b90beaSjason	 * mantissa byte from y.  The variable `bit' denotes the bit
02b90beaSjason	 * within m.  We also define some macros to deal with everything.
02b90beaSjason	 */
02b90beaSjason	x3 = x->fp_mant[3];
02b90beaSjason	x2 = x->fp_mant[2];
02b90beaSjason	x1 = x->fp_mant[1];
02b90beaSjason	x0 = x->fp_mant[0];
02b90beaSjason	sticky = a3 = a2 = a1 = a0 = 0;
02b90beaSjason
02b90beaSjason#define	ADD	/* A += X */ \
02b90beaSjason	FPU_ADDS(a3, a3, x3); \
02b90beaSjason	FPU_ADDCS(a2, a2, x2); \
02b90beaSjason	FPU_ADDCS(a1, a1, x1); \
02b90beaSjason	FPU_ADDC(a0, a0, x0)
02b90beaSjason
02b90beaSjason#define	SHR1	/* A >>= 1, with sticky */ \
02b90beaSjason	sticky |= a3 & 1, a3 = (a3 >> 1) | (a2 << 31), \
02b90beaSjason	a2 = (a2 >> 1) | (a1 << 31), a1 = (a1 >> 1) | (a0 << 31), a0 >>= 1
02b90beaSjason
02b90beaSjason#define	SHR32	/* A >>= 32, with sticky */ \
02b90beaSjason	sticky |= a3, a3 = a2, a2 = a1, a1 = a0, a0 = 0
02b90beaSjason
02b90beaSjason#define	STEP	/* each 1-bit step of the multiplication */ \
02b90beaSjason	SHR1; if (bit & m) { ADD; }; bit <<= 1
02b90beaSjason
02b90beaSjason	/*
02b90beaSjason	 * We are ready to begin.  The multiply loop runs once for each
02b90beaSjason	 * of the four 32-bit words.  Some words, however, are special.
02b90beaSjason	 * As noted above, the low order bits of Y are often zero.  Even
02b90beaSjason	 * if not, the first loop can certainly skip the guard bits.
02b90beaSjason	 * The last word of y has its highest 1-bit in position FP_NMANT-1,
02b90beaSjason	 * so we stop the loop when we move past that bit.
02b90beaSjason	 */
02b90beaSjason	if ((m = y->fp_mant[3]) == 0) {
02b90beaSjason		/* SHR32; */			/* unneeded since A==0 */
02b90beaSjason	} else {
02b90beaSjason		bit = 1 << FP_NG;
02b90beaSjason		do {
02b90beaSjason			STEP;
02b90beaSjason		} while (bit != 0);
02b90beaSjason	}
02b90beaSjason	if ((m = y->fp_mant[2]) == 0) {
02b90beaSjason		SHR32;
02b90beaSjason	} else {
02b90beaSjason		bit = 1;
02b90beaSjason		do {
02b90beaSjason			STEP;
02b90beaSjason		} while (bit != 0);
02b90beaSjason	}
02b90beaSjason	if ((m = y->fp_mant[1]) == 0) {
02b90beaSjason		SHR32;
02b90beaSjason	} else {
02b90beaSjason		bit = 1;
02b90beaSjason		do {
02b90beaSjason			STEP;
02b90beaSjason		} while (bit != 0);
02b90beaSjason	}
02b90beaSjason	m = y->fp_mant[0];		/* definitely != 0 */
02b90beaSjason	bit = 1;
02b90beaSjason	do {
02b90beaSjason		STEP;
02b90beaSjason	} while (bit <= m);
02b90beaSjason
02b90beaSjason	/*
02b90beaSjason	 * Done with mantissa calculation.  Get exponent and handle
02b90beaSjason	 * 11.111...1 case, then put result in place.  We reuse x since
02b90beaSjason	 * it already has the right class (FP_NUM).
02b90beaSjason	 */
02b90beaSjason	m = x->fp_exp + y->fp_exp;
02b90beaSjason	if (a0 >= FP_2) {
02b90beaSjason		SHR1;
02b90beaSjason		m++;
02b90beaSjason	}
02b90beaSjason	x->fp_sign ^= y->fp_sign;
02b90beaSjason	x->fp_exp = m;
02b90beaSjason	x->fp_sticky = sticky;
02b90beaSjason	x->fp_mant[3] = a3;
02b90beaSjason	x->fp_mant[2] = a2;
02b90beaSjason	x->fp_mant[1] = a1;
02b90beaSjason	x->fp_mant[0] = a0;
02b90beaSjason	return (x);
02b90beaSjason}