/*	$NetBSD: umul.S,v 1.1 2005/12/20 19:28:50 christos Exp $	*/

/*
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: Header: umul.s,v 1.4 92/06/25 13:24:05 torek Exp
 */

#include <machine/asm.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
	.asciz "@(#)umul.s	8.1 (Berkeley) 6/4/93"
#else
	RCSID("$NetBSD: umul.S,v 1.1 2005/12/20 19:28:50 christos Exp $")
#endif
#endif /* LIBC_SCCS and not lint */

/*
 * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
 * upper 32 bits of the 64-bit product).
 *
 * This code optimizes short (less than 13-bit) multiplies.  Short
 * multiplies require 25 instruction cycles, and long ones require
 * 45 instruction cycles.
 *
 * On return, overflow has occurred (%o1 is not zero) if and only if
 * the Z condition code is clear, allowing, e.g., the following:
 *
 *	call	.umul
 *	nop
 *	bnz	overflow	(or tnz)
 */
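/*
 * For reference, a minimal C sketch of the calling contract above (the
 * names `a', `b', `hi', and `umul' are illustrative, not part of this
 * source; assumes <stdint.h>):
 *
 *	uint32_t umul(uint32_t a, uint32_t b, uint32_t *hi)
 *	{
 *		uint64_t p = (uint64_t)a * b;
 *		*hi = (uint32_t)(p >> 32);	// comes back in %o1;
 *						// nonzero iff overflow (Z clear)
 *		return (uint32_t)p;		// comes back in %o0
 *	}
 */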
FUNC(.umul)
	or	%o0, %o1, %o4
	mov	%o0, %y		! multiplier -> Y
	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
	be	Lmul_shortway	! if zero, can do it the short way
	andcc	%g0, %g0, %o4	! zero the partial product and clear N and V

	/*
	 * Long multiply.  32 steps, followed by a final shift step.
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %o1, %o4	! 13
	mulscc	%o4, %o1, %o4	! 14
	mulscc	%o4, %o1, %o4	! 15
	mulscc	%o4, %o1, %o4	! 16
	mulscc	%o4, %o1, %o4	! 17
	mulscc	%o4, %o1, %o4	! 18
	mulscc	%o4, %o1, %o4	! 19
	mulscc	%o4, %o1, %o4	! 20
	mulscc	%o4, %o1, %o4	! 21
	mulscc	%o4, %o1, %o4	! 22
	mulscc	%o4, %o1, %o4	! 23
	mulscc	%o4, %o1, %o4	! 24
	mulscc	%o4, %o1, %o4	! 25
	mulscc	%o4, %o1, %o4	! 26
	mulscc	%o4, %o1, %o4	! 27
	mulscc	%o4, %o1, %o4	! 28
	mulscc	%o4, %o1, %o4	! 29
	mulscc	%o4, %o1, %o4	! 30
	mulscc	%o4, %o1, %o4	! 31
	mulscc	%o4, %o1, %o4	! 32
	mulscc	%o4, %g0, %o4	! final shift
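	/*
	 * Each `mulscc' above performs one step of the classic
	 * shift-and-add multiply.  A C sketch of a single step, assuming
	 * standard SPARC V8 MULScc semantics (state names are
	 * illustrative, not part of this source; assumes <stdint.h>):
	 *
	 *	struct mstate { uint32_t acc, y, n, v; };  // %o4, %y, icc
	 *
	 *	void mulscc(struct mstate *s, uint32_t rs2)
	 *	{
	 *		// shift acc right, inserting N xor V at the top
	 *		uint32_t op1 = (s->acc >> 1) | ((s->n ^ s->v) << 31);
	 *		// add rs2 only if Y's low bit is set
	 *		uint32_t op2 = (s->y & 1) ? rs2 : 0;
	 *		uint32_t sum = op1 + op2;
	 *		s->n = sum >> 31;	// icc.N from the add
	 *		s->v = ((~(op1 ^ op2) & (op1 ^ sum)) >> 31) & 1;
	 *		// Y shifts right; old acc LSB enters Y's bit 31
	 *		s->y = (s->y >> 1) | (s->acc << 31);
	 *		s->acc = sum;
	 *	}
	 */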
	/*
	 * Normally, with the shift-and-add approach, if both numbers are
	 * positive you get the correct result.  With 32-bit two's-complement
	 * numbers, -x is represented as
	 *
	 *	(2 - x/2^32) mod 2  *  2^32
	 *
	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
	 * we can treat this as if the radix point were just to the left
	 * of the sign bit (multiply by 2^32), and get
	 *
	 *	-x = (2 - x) mod 2
	 *
	 * Then, ignoring the `mod 2's for convenience:
	 *
	 *	 x *  y	= xy
	 *	-x *  y	= 2y - xy
	 *	 x * -y	= 2x - xy
	 *	-x * -y	= 4 - 2x - 2y + xy
	 *
	 * For signed multiplies, we subtract (x << 32) from the partial
	 * product to fix this problem for negative multipliers (see mul.s).
	 * Because of the way the shift into the partial product is calculated
	 * (N xor V), this term is automatically removed for the multiplicand,
	 * so we don't have to adjust.
	 *
	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
	 * and the correction is wrong.  So for unsigned multiplies where the
	 * high order bit is one, we end up with xy - (y << 32).  To fix it
	 * we add y << 32.
	 */
	tst	%o1
	bl,a	1f		! if %o1 < 0 (high order bit = 1),
	add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)

1:	rd	%y, %o0		! get lower half of product
	retl
	addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
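	/*
	 * Putting the long path together as a C sketch, reusing the
	 * hypothetical `mulscc' model above (names are illustrative, not
	 * part of this source):
	 *
	 *	uint64_t umul_model(uint32_t x, uint32_t m)  // x=%o0, m=%o1
	 *	{
	 *		struct mstate s = { 0, x, 0, 0 };  // multiplier -> Y
	 *		for (int i = 0; i < 32; i++)
	 *			mulscc(&s, m);		// 32 multiply steps
	 *		mulscc(&s, 0);			// final shift
	 *		uint32_t hi = s.acc, lo = s.y;
	 *		if ((int32_t)m < 0)	// high order bit set: the
	 *			hi += x;	// signed correction was wrong,
	 *					// so add the term back
	 *		return ((uint64_t)hi << 32) | lo;
	 *	}
	 */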
Lmul_shortway:
	/*
	 * Short multiply.  12 steps, followed by a final shift step.
	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
	 * but there is no problem with %o0 being negative (unlike above),
	 * and overflow is impossible (the answer is at most 24 bits long).
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %g0, %o4	! final shift

	/*
	 * %o4 has 20 of the bits that should be in the result; %y has
	 * the bottom 12 (as %y's top 12).  That is:
	 *
	 *	        %o4              %y
	 *	+----------------+----------------+
	 *	| -12- |  -20-   | -12- |  -20-   |
	 *	+------(---------+------)---------+
	 *	        -----result-----
	 *
	 * The 12 bits of %o4 left of the `result' area are all zero;
	 * in fact, all top 20 bits of %o4 are zero.
	 */

	rd	%y, %o5
	sll	%o4, 12, %o0	! shift middle bits left 12
	srl	%o5, 20, %o5	! shift low bits right 20
	or	%o5, %o0, %o0
	retl
	addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
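	/*
	 * The recombination above, as a one-line C sketch (`o4' and `y'
	 * name the register contents; not part of this source):
	 *
	 *	uint32_t result = (o4 << 12) | (y >> 20);  // at most 24 bits
	 */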