/*
 * Copyright (c) 1992 The Regents of the University of California.
 * All rights reserved.
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * %sccs.include.redist.c%
 *
 * from: $Header: umul.s,v 1.4 92/06/25 13:24:05 torek Exp $
 */

#if defined(LIBC_SCCS) && !defined(lint)
	.asciz "@(#)umul.s	5.1 (Berkeley) 06/25/92"
#endif /* LIBC_SCCS and not lint */

/*
 * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
 * upper 32 bits of the 64-bit product).
 *
 * This code optimizes short (less than 13-bit) multiplies.  Short
 * multiplies require 25 instruction cycles, and long ones require
 * 45 instruction cycles.
 *
 * On return, overflow has occurred (%o1 is not zero) if and only if
 * the Z condition code is clear, allowing, e.g., the following:
 *
 *	call	.umul
 *	nop
 *	bnz	overflow	(or tnz)
 */
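/*
 * In C terms, the contract is roughly the following (a sketch, not
 * part of the original source; the C function name is illustrative):
 *
 *	uint64_t
 *	umul(uint32_t o0, uint32_t o1)
 *	{
 *		return ((uint64_t)o0 * o1);	-- high word in %o1, low in %o0
 *	}
 *
 * A nonzero high word is exactly the overflow condition that the
 * Z condition code reports to the bnz/tnz caller above.
 */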
#include "DEFS.h"
FUNC(.umul)
	or	%o0, %o1, %o4
	mov	%o0, %y		! multiplier -> Y
	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
	be	Lmul_shortway	! if zero, can do it the short way
	andcc	%g0, %g0, %o4	! zero the partial product and clear N and V

	/*
	 * Long multiply.  32 steps, followed by a final shift step.
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %o1, %o4	! 13
	mulscc	%o4, %o1, %o4	! 14
	mulscc	%o4, %o1, %o4	! 15
	mulscc	%o4, %o1, %o4	! 16
	mulscc	%o4, %o1, %o4	! 17
	mulscc	%o4, %o1, %o4	! 18
	mulscc	%o4, %o1, %o4	! 19
	mulscc	%o4, %o1, %o4	! 20
	mulscc	%o4, %o1, %o4	! 21
	mulscc	%o4, %o1, %o4	! 22
	mulscc	%o4, %o1, %o4	! 23
	mulscc	%o4, %o1, %o4	! 24
	mulscc	%o4, %o1, %o4	! 25
	mulscc	%o4, %o1, %o4	! 26
	mulscc	%o4, %o1, %o4	! 27
	mulscc	%o4, %o1, %o4	! 28
	mulscc	%o4, %o1, %o4	! 29
	mulscc	%o4, %o1, %o4	! 30
	mulscc	%o4, %o1, %o4	! 31
	mulscc	%o4, %o1, %o4	! 32
	mulscc	%o4, %g0, %o4	! final shift

	/*
	 * Normally, with the shift-and-add approach, if both numbers are
	 * positive you get the correct result.  With 32-bit two's-complement
	 * numbers, -x is represented as
	 *
	 *	          x                32
	 *	( 2  -  ------ ) mod 2  *  2
	 *	           32
	 *	          2
	 *
	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
	 * we can treat this as if the radix point were just to the left
	 * of the sign bit (multiply by 2^32), and get
	 *
	 *	-x  =  (2 - x) mod 2
	 *
	 * Then, ignoring the `mod 2's for convenience:
	 *
	 *	 x *  y	= xy
	 *	-x *  y	= 2y - xy
	 *	 x * -y	= 2x - xy
	 *	-x * -y	= 4 - 2x - 2y + xy
	 *
	 * For signed multiplies, we subtract (x << 32) from the partial
	 * product to fix this problem for negative multipliers (see mul.s).
	 * Because of the way the shift into the partial product is calculated
	 * (N xor V), this term is automatically removed for the multiplicand,
	 * so we don't have to adjust.
	 *
	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
	 * and the correction is wrong.  So for unsigned multiplies where the
	 * high order bit is one, we end up with xy - (y << 32).  To fix it
	 * we add y << 32.
	 */
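	/*
	 * A C model of this fixup (a sketch, not part of the original
	 * source; the C function name is illustrative).  The 32 mulscc
	 * steps effectively treat %o1 as signed, so when its high bit
	 * is set the register pair holds a product short by %o0 << 32:
	 *
	 *	uint64_t
	 *	umul_model(uint32_t o0, uint32_t o1)
	 *	{
	 *		-- raw product, with %o1 sign-extended as mulscc sees it
	 *		uint64_t p = (uint64_t)o0 * (uint64_t)(int64_t)(int32_t)o1;
	 *
	 *		if ((int32_t)o1 < 0)		-- high order bit set?
	 *			p += (uint64_t)o0 << 32;	-- add y << 32 back
	 *		return (p);
	 *	}
	 */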
	tst	%o1
	bl,a	1f		! if %o1 < 0 (high order bit = 1),
	add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
1:	rd	%y, %o0		! get lower half of product
	retl
	addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0

Lmul_shortway:
	/*
	 * Short multiply.  12 steps, followed by a final shift step.
	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
	 * but there is no problem with %o0 being negative (unlike above),
	 * and overflow is impossible (the answer is at most 24 bits long).
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %g0, %o4	! final shift

	/*
	 * %o4 has 20 of the bits that should be in the result; %y has
	 * the bottom 12 (as %y's top 12).  That is:
	 *
	 *	       %o4               %y
	 *	+----------------+----------------+
	 *	| -12- |  -20-   | -12- |  -20-   |
	 *	+------(---------+------)---------+
	 *	        -----result-----
	 *
	 * The 12 bits of %o4 left of the `result' area are all zero;
	 * in fact, all top 20 bits of %o4 are zero.  (A C sketch of this
	 * reconstruction appears at the end of the file.)
	 */

	rd	%y, %o5
	sll	%o4, 12, %o0	! shift middle bits left 12
	srl	%o5, 20, %o5	! shift low bits right 20
	or	%o5, %o0, %o0
	retl
	addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
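	/*
	 * A C sketch of the short-multiply reconstruction (not part of
	 * the original source; the C function name is illustrative).
	 * The low 20 bits of %o4 are the high part of the result and
	 * the top 12 bits of %y are the low part, so the three shift/or
	 * instructions above compute:
	 *
	 *	uint32_t
	 *	short_result(uint32_t o4, uint32_t y)
	 *	{
	 *		return ((o4 << 12) | (y >> 20));	-- at most 24 bits
	 *	}
	 */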