/*
 * Copyright (c) 1992 The Regents of the University of California.
 * All rights reserved.
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * %sccs.include.redist.c%
 *
 * from: $Header: mul.s,v 1.5 92/06/25 13:24:03 torek Exp $
 */

#if defined(LIBC_SCCS) && !defined(lint)
	.asciz "@(#)mul.s	5.1 (Berkeley) 06/25/92"
#endif /* LIBC_SCCS and not lint */

/*
 * Signed multiply, from Appendix E of the Sparc Version 8
 * Architecture Manual.
 *
 * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
 * the 64-bit product).
 *
 * This code optimizes short (less than 13-bit) multiplies.
 */

#include "DEFS.h"
FUNC(.mul)
	mov	%o0, %y		! multiplier -> Y
	andncc	%o0, 0xfff, %g0	! test bits 12..31
	be	Lmul_shortway	! if zero, can do it the short way
	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V

	/*
	 * Long multiply.  32 steps, followed by a final shift step.
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %o1, %o4	! 13
	mulscc	%o4, %o1, %o4	! 14
	mulscc	%o4, %o1, %o4	! 15
	mulscc	%o4, %o1, %o4	! 16
	mulscc	%o4, %o1, %o4	! 17
	mulscc	%o4, %o1, %o4	! 18
	mulscc	%o4, %o1, %o4	! 19
	mulscc	%o4, %o1, %o4	! 20
	mulscc	%o4, %o1, %o4	! 21
	mulscc	%o4, %o1, %o4	! 22
	mulscc	%o4, %o1, %o4	! 23
	mulscc	%o4, %o1, %o4	! 24
	mulscc	%o4, %o1, %o4	! 25
	mulscc	%o4, %o1, %o4	! 26
	mulscc	%o4, %o1, %o4	! 27
	mulscc	%o4, %o1, %o4	! 28
	mulscc	%o4, %o1, %o4	! 29
	mulscc	%o4, %o1, %o4	! 30
	mulscc	%o4, %o1, %o4	! 31
	mulscc	%o4, %o1, %o4	! 32
	mulscc	%o4, %g0, %o4	! final shift

	! If %o0 was negative, the result is
	!	(%o0 * %o1) + (%o1 << 32)
	! We fix that here.

	tst	%o0
	bge	1f
	 rd	%y, %o0

	! %o0 was indeed negative; fix upper 32 bits of result by subtracting
	! %o1 (i.e., return %o4 - %o1 in %o1).
	retl
	 sub	%o4, %o1, %o1

1:
	retl
	 mov	%o4, %o1

Lmul_shortway:
	/*
	 * Short multiply.  12 steps, followed by a final shift step.
	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
	 * but there is no problem with %o0 being negative (unlike above).
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %g0, %o4	! final shift

	/*
	 * %o4 has 20 of the bits that should be in the low part of the
	 * result; %y has the bottom 12 (as %y's top 12).  That is:
	 *
	 *	  %o4		  %y
	 * +----------------+----------------+
	 * | -12- |   -20-  | -12- |   -20-  |
	 * +------(---------+------)---------+
	 *	   --hi-- ----low-part----
	 *
	 * The upper 12 bits of %o4 should be sign-extended to form the
	 * high part of the product (i.e., highpart = %o4 >> 20).
	 */

	rd	%y, %o5
	sll	%o4, 12, %o0	! shift middle bits left 12
	srl	%o5, 20, %o5	! shift low bits right 20, zero fill at left
	or	%o5, %o0, %o0	! construct low part of result
	retl
	 sra	%o4, 20, %o1	! ... and extract high part of result
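
/*
 * Illustrative caller (an editor's sketch, not part of the original
 * file): .mul takes its two 32-bit operands in %o0 and %o1 and, as
 * described in the header comment, returns the low word of the 64-bit
 * product in %o0 and the high word in %o1, so a caller only needs the
 * standard call sequence.  The name "mulexample" and the reuse of the
 * FUNC() macro from DEFS.h for it are assumptions made for this sketch
 * only; the second operand is moved in the call's delay slot so the
 * slot is not wasted.
 */
FUNC(mulexample)
	save	%sp, -96, %sp	! new register window, minimal frame
	mov	%i0, %o0	! first operand
	call	.mul		! 64-bit signed product ...
	 mov	%i1, %o1	! ... second operand (delay slot)
	mov	%o0, %i0	! low 32 bits of result back to caller
	mov	%o1, %i1	! high 32 bits of result back to caller
	ret
	 restore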