/*
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * %sccs.include.redist.c%
 *
 * from: $Header: umul.s,v 1.4 92/06/25 13:24:05 torek Exp $
 */

#if defined(LIBC_SCCS) && !defined(lint)
	.asciz "@(#)umul.s	8.1 (Berkeley) 06/04/93"
#endif /* LIBC_SCCS and not lint */

/*
 * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
 * upper 32 bits of the 64-bit product).
 *
 * This code optimizes short (less than 13-bit) multiplies.  Short
 * multiplies require 25 instruction cycles, and long ones require
 * 45 instruction cycles.
 *
 * On return, overflow has occurred (%o1 is not zero) if and only if
 * the Z condition code is clear, allowing, e.g., the following:
 *
 *	call	.umul
 *	nop
 *	bnz	overflow	(or tnz)
 */

#include "DEFS.h"
FUNC(.umul)
	or	%o0, %o1, %o4
	mov	%o0, %y		! multiplier -> Y
	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
	be	Lmul_shortway	! if zero, can do it the short way
	andcc	%g0, %g0, %o4	! zero the partial product and clear N and V

	/*
	 * Long multiply.  32 steps, followed by a final shift step.
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %o1, %o4	! 13
	mulscc	%o4, %o1, %o4	! 14
	mulscc	%o4, %o1, %o4	! 15
	mulscc	%o4, %o1, %o4	! 16
	mulscc	%o4, %o1, %o4	! 17
	mulscc	%o4, %o1, %o4	! 18
	mulscc	%o4, %o1, %o4	! 19
	mulscc	%o4, %o1, %o4	! 20
	mulscc	%o4, %o1, %o4	! 21
	mulscc	%o4, %o1, %o4	! 22
	mulscc	%o4, %o1, %o4	! 23
	mulscc	%o4, %o1, %o4	! 24
	mulscc	%o4, %o1, %o4	! 25
	mulscc	%o4, %o1, %o4	! 26
	mulscc	%o4, %o1, %o4	! 27
	mulscc	%o4, %o1, %o4	! 28
	mulscc	%o4, %o1, %o4	! 29
	mulscc	%o4, %o1, %o4	! 30
	mulscc	%o4, %o1, %o4	! 31
	mulscc	%o4, %o1, %o4	! 32
	mulscc	%o4, %g0, %o4	! final shift
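
	/*
	 * Each `mulscc' above is one hardware multiply step.  A rough C
	 * model of a single step (an illustrative sketch, with names
	 * invented for exposition; `y', `n', and `v' stand for the Y
	 * register and the N and V condition codes, and in this file
	 * rs1 and rd are both %o4 while rs2 is %o1):
	 *
	 *	uint32_t y;
	 *	int n, v;
	 *
	 *	uint32_t
	 *	mulscc(uint32_t rs1, uint32_t rs2)
	 *	{
	 *		uint32_t v1, v2, rd;
	 *
	 *		v1 = ((uint32_t)(n ^ v) << 31) | (rs1 >> 1);
	 *		v2 = (y & 1) ? rs2 : 0;
	 *		rd = v1 + v2;
	 *		n = rd >> 31;
	 *		v = (~(v1 ^ v2) & (v1 ^ rd)) >> 31;
	 *		y = (rs1 << 31) | (y >> 1);
	 *		return (rd);
	 *	}
	 *
	 * The final step with %g0 adds nothing and simply performs the
	 * last right shift of the partial product.
	 */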

	/*
	 * Normally, with the shift-and-add approach, if both numbers are
	 * positive you get the correct result.  With 32-bit two's-complement
	 * numbers, -x is represented as
	 *
	 *		  x		 32
	 *	( 2  -  ------ ) mod 2  *  2
	 *		    32
	 *		   2
	 *
	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
	 * we can treat this as if the radix point were just to the left
	 * of the sign bit (multiply by 2^32), and get
	 *
	 *	-x = (2 - x) mod 2
	 *
	 * Then, ignoring the `mod 2's for convenience:
	 *
	 *	 x *  y	= xy
	 *	-x *  y	= 2y - xy
	 *	 x * -y	= 2x - xy
	 *	-x * -y	= 4 - 2x - 2y + xy
	 *
	 * For signed multiplies, we subtract (x << 32) from the partial
	 * product to fix this problem for negative multipliers (see mul.s).
	 * Because of the way the shift into the partial product is calculated
	 * (N xor V), this term is automatically removed for the multiplicand,
	 * so we don't have to adjust.
	 *
	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
	 * and the correction is wrong.  So for unsigned multiplies where the
	 * high order bit is one, we end up with xy - (y << 32).  To fix it
	 * we add y << 32.
	 */
	tst	%o1
	bl,a	1f		! if %o1 < 0 (high order bit = 1),
	add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
1:	rd	%y, %o0		! get lower half of product
	retl
	addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0

Lmul_shortway:
	/*
	 * Short multiply.  12 steps, followed by a final shift step.
	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
	 * but there is no problem with %o0 being negative (unlike above),
	 * and overflow is impossible (the answer is at most 24 bits long).
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %g0, %o4	! final shift

	/*
	 * %o4 has 20 of the bits that should be in the result; %y has
	 * the bottom 12 (as %y's top 12).  That is:
	 *
	 *	  %o4		    %y
	 * +----------------+----------------+
	 * | -12- |  -20-   | -12- |  -20-   |
	 * +------(---------+------)---------+
	 *	   -----result-----
	 *
	 * The 12 bits of %o4 left of the `result' area are all zero;
	 * in fact, all top 20 bits of %o4 are zero.
	 */

	rd	%y, %o5
	sll	%o4, 12, %o0	! shift middle bits left 12
	srl	%o5, 20, %o5	! shift low bits right 20
	or	%o5, %o0, %o0
	retl
	addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
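
	/*
	 * The reassembly above, in C terms (an illustrative sketch;
	 * the variable names are invented for exposition):
	 *
	 *	uint32_t mid = o4;		20 product bits, rest zero
	 *	uint32_t low = y >> 20;		bottom 12 product bits
	 *	uint32_t lo = (mid << 12) | low;	returned in %o0
	 *	uint32_t hi = 0;		product fits in 24 bits
	 */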
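
	/*
	 * For reference, the fixup in the long path earlier can be
	 * checked in C (an illustrative sketch; `a' stands for the
	 * multiplier from %o0 and `b' for the multiplicand from %o1).
	 * The 32 multiply steps consume `a' unsigned but, through the
	 * N-xor-V shift, treat `b' as signed:
	 *
	 *	uint64_t raw = (uint64_t)a * (uint64_t)(int64_t)(int32_t)b;
	 *
	 * When the high bit of `b' is set, raw equals a*b - (a << 32)
	 * (mod 2^64), so adding `a' to the upper word restores the
	 * unsigned product:
	 *
	 *	uint32_t hi = (uint32_t)(raw >> 32) +
	 *	    ((int32_t)b < 0 ? a : 0);
	 *	uint32_t lo = (uint32_t)raw;
	 */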