10b9f5089Scgd/* 20b9f5089Scgd * Copyright (c) 1992, 1993 30b9f5089Scgd * The Regents of the University of California. All rights reserved. 40b9f5089Scgd * 50b9f5089Scgd * This software was developed by the Computer Systems Engineering group 60b9f5089Scgd * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and 70b9f5089Scgd * contributed to Berkeley. 80b9f5089Scgd * 90b9f5089Scgd * Redistribution and use in source and binary forms, with or without 100b9f5089Scgd * modification, are permitted provided that the following conditions 110b9f5089Scgd * are met: 120b9f5089Scgd * 1. Redistributions of source code must retain the above copyright 130b9f5089Scgd * notice, this list of conditions and the following disclaimer. 140b9f5089Scgd * 2. Redistributions in binary form must reproduce the above copyright 150b9f5089Scgd * notice, this list of conditions and the following disclaimer in the 160b9f5089Scgd * documentation and/or other materials provided with the distribution. 17eb7c1594Sagc * 3. Neither the name of the University nor the names of its contributors 180b9f5089Scgd * may be used to endorse or promote products derived from this software 190b9f5089Scgd * without specific prior written permission. 200b9f5089Scgd * 210b9f5089Scgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 220b9f5089Scgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 230b9f5089Scgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 240b9f5089Scgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 250b9f5089Scgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 260b9f5089Scgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 270b9f5089Scgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 280b9f5089Scgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 290b9f5089Scgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 300b9f5089Scgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 310b9f5089Scgd * SUCH DAMAGE. 320b9f5089Scgd * 330b9f5089Scgd * from: Header: divrem.m4,v 1.4 92/06/25 13:23:57 torek Exp 34*d2c61541Smartin * $NetBSD: divrem.m4,v 1.6 2011/03/23 20:54:35 martin Exp $ 350b9f5089Scgd */ 360b9f5089Scgd 370b9f5089Scgd/* 380b9f5089Scgd * Division and remainder, from Appendix E of the Sparc Version 8 390b9f5089Scgd * Architecture Manual, with fixes from Gordon Irlam. 400b9f5089Scgd */ 410b9f5089Scgd 420b9f5089Scgd#if defined(LIBC_SCCS) && !defined(lint) 430b9f5089Scgd .asciz "@(#)divrem.m4 8.1 (Berkeley) 6/4/93" 440b9f5089Scgd#endif /* LIBC_SCCS and not lint */ 450b9f5089Scgd 460b9f5089Scgd/* 470b9f5089Scgd * Input: dividend and divisor in %o0 and %o1 respectively. 480b9f5089Scgd * 490b9f5089Scgd * m4 parameters: 500b9f5089Scgd * NAME name of function to generate 510b9f5089Scgd * OP OP=div => %o0 / %o1; OP=rem => %o0 % %o1 520b9f5089Scgd * S S=true => signed; S=false => unsigned 530b9f5089Scgd * 540b9f5089Scgd * Algorithm parameters: 550b9f5089Scgd * N how many bits per iteration we try to get (4) 560b9f5089Scgd * WORDSIZE total number of bits (32) 570b9f5089Scgd * 580b9f5089Scgd * Derived constants: 590b9f5089Scgd * TWOSUPN 2^N, for label generation (m4 exponentiation currently broken) 600b9f5089Scgd * TOPBITS number of bits in the top `decade' of a number 610b9f5089Scgd * 620b9f5089Scgd * Important variables: 630b9f5089Scgd * Q the partial quotient under development (initially 0) 640b9f5089Scgd * R the remainder so far, initially the dividend 650b9f5089Scgd * ITER number of main division loop iterations required; 660b9f5089Scgd * equal to ceil(log2(quotient) / N). Note that this 670b9f5089Scgd * is the log base (2^N) of the quotient. 680b9f5089Scgd * V the current comparand, initially divisor*2^(ITER*N-1) 690b9f5089Scgd * 700b9f5089Scgd * Cost: 710b9f5089Scgd * Current estimate for non-large dividend is 720b9f5089Scgd * ceil(log2(quotient) / N) * (10 + 7N/2) + C 730b9f5089Scgd * A large dividend is one greater than 2^(31-TOPBITS) and takes a 740b9f5089Scgd * different path, as the upper bits of the quotient must be developed 750b9f5089Scgd * one bit at a time. 760b9f5089Scgd */ 770b9f5089Scgd 780b9f5089Scgddefine(N, `4') 790b9f5089Scgddefine(TWOSUPN, `16') 800b9f5089Scgddefine(WORDSIZE, `32') 810b9f5089Scgddefine(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N))) 820b9f5089Scgd 830b9f5089Scgddefine(dividend, `%o0') 840b9f5089Scgddefine(divisor, `%o1') 850b9f5089Scgddefine(Q, `%o2') 860b9f5089Scgddefine(R, `%o3') 870b9f5089Scgddefine(ITER, `%o4') 880b9f5089Scgddefine(V, `%o5') 890b9f5089Scgd 900b9f5089Scgd/* m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d */ 910b9f5089Scgddefine(T, `%g1') 92*d2c61541Smartindefine(SC, `%g5') 930b9f5089Scgdifelse(S, `true', `define(SIGN, `%g6')') 940b9f5089Scgd 950b9f5089Scgd/* 960b9f5089Scgd * This is the recursive definition for developing quotient digits. 970b9f5089Scgd * 980b9f5089Scgd * Parameters: 990b9f5089Scgd * $1 the current depth, 1 <= $1 <= N 1000b9f5089Scgd * $2 the current accumulation of quotient bits 1010b9f5089Scgd * N max depth 1020b9f5089Scgd * 1030b9f5089Scgd * We add a new bit to $2 and either recurse or insert the bits in 1040b9f5089Scgd * the quotient. R, Q, and V are inputs and outputs as defined above; 1050b9f5089Scgd * the condition codes are expected to reflect the input R, and are 1060b9f5089Scgd * modified to reflect the output R. 1070b9f5089Scgd */ 1080b9f5089Scgddefine(DEVELOP_QUOTIENT_BITS, 1090b9f5089Scgd` ! depth $1, accumulated bits $2 1100b9f5089Scgd bl L.$1.eval(TWOSUPN+$2) 1110b9f5089Scgd srl V,1,V 1120b9f5089Scgd ! remainder is positive 1130b9f5089Scgd subcc R,V,R 1140b9f5089Scgd ifelse($1, N, 1150b9f5089Scgd ` b 9f 1160b9f5089Scgd add Q, ($2*2+1), Q 1170b9f5089Scgd ', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')') 1180b9f5089ScgdL.$1.eval(TWOSUPN+$2): 1190b9f5089Scgd ! remainder is negative 1200b9f5089Scgd addcc R,V,R 1210b9f5089Scgd ifelse($1, N, 1220b9f5089Scgd ` b 9f 1230b9f5089Scgd add Q, ($2*2-1), Q 1240b9f5089Scgd ', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')') 1250b9f5089Scgd ifelse($1, 1, `9:')') 1260b9f5089Scgd 127368d2cb7Smrg#include <machine/asm.h> 1280b9f5089Scgd#include <machine/trap.h> 1290b9f5089Scgd 1300b9f5089ScgdFUNC(NAME) 1310b9f5089Scgdifelse(S, `true', 1320b9f5089Scgd` ! compute sign of result; if neither is negative, no problem 1330b9f5089Scgd orcc divisor, dividend, %g0 ! either negative? 1340b9f5089Scgd bge 2f ! no, go do the divide 135a3fa6016Spk ifelse(OP, `div', 136a3fa6016Spk `xor divisor, dividend, SIGN', 137a3fa6016Spk `mov dividend, SIGN') ! compute sign in any case 1380b9f5089Scgd tst divisor 1390b9f5089Scgd bge 1f 1400b9f5089Scgd tst dividend 1410b9f5089Scgd ! divisor is definitely negative; dividend might also be negative 1420b9f5089Scgd bge 2f ! if dividend not negative... 1430b9f5089Scgd neg divisor ! in any case, make divisor nonneg 1440b9f5089Scgd1: ! dividend is negative, divisor is nonnegative 1450b9f5089Scgd neg dividend ! make dividend nonnegative 1460b9f5089Scgd2: 1470b9f5089Scgd') 1480b9f5089Scgd ! Ready to divide. Compute size of quotient; scale comparand. 1490b9f5089Scgd orcc divisor, %g0, V 1500b9f5089Scgd bnz 1f 1510b9f5089Scgd mov dividend, R 1520b9f5089Scgd 1530b9f5089Scgd ! Divide by zero trap. If it returns, return 0 (about as 1540b9f5089Scgd ! wrong as possible, but that is what SunOS does...). 1550b9f5089Scgd t ST_DIV0 1560b9f5089Scgd retl 1570b9f5089Scgd clr %o0 1580b9f5089Scgd 1590b9f5089Scgd1: 1600b9f5089Scgd cmp R, V ! if divisor exceeds dividend, done 1610b9f5089Scgd blu Lgot_result ! (and algorithm fails otherwise) 1620b9f5089Scgd clr Q 1630b9f5089Scgd sethi %hi(1 << (WORDSIZE - TOPBITS - 1)), T 1640b9f5089Scgd cmp R, T 1650b9f5089Scgd blu Lnot_really_big 1660b9f5089Scgd clr ITER 1670b9f5089Scgd 1680b9f5089Scgd ! `Here the dividend is >= 2^(31-N) or so. We must be careful here, 1690b9f5089Scgd ! as our usual N-at-a-shot divide step will cause overflow and havoc. 1700b9f5089Scgd ! The number of bits in the result here is N*ITER+SC, where SC <= N. 1710b9f5089Scgd ! Compute ITER in an unorthodox manner: know we need to shift V into 1720b9f5089Scgd ! the top decade: so do not even bother to compare to R.' 1730b9f5089Scgd 1: 1740b9f5089Scgd cmp V, T 1750b9f5089Scgd bgeu 3f 1760b9f5089Scgd mov 1, SC 1770b9f5089Scgd sll V, N, V 1780b9f5089Scgd b 1b 1790b9f5089Scgd inc ITER 1800b9f5089Scgd 1810b9f5089Scgd ! Now compute SC. 1820b9f5089Scgd 2: addcc V, V, V 1830b9f5089Scgd bcc Lnot_too_big 1840b9f5089Scgd inc SC 1850b9f5089Scgd 1860b9f5089Scgd ! We get here if the divisor overflowed while shifting. 1870b9f5089Scgd ! This means that R has the high-order bit set. 1880b9f5089Scgd ! Restore V and subtract from R. 1890b9f5089Scgd sll T, TOPBITS, T ! high order bit 1900b9f5089Scgd srl V, 1, V ! rest of V 1910b9f5089Scgd add V, T, V 1920b9f5089Scgd b Ldo_single_div 1930b9f5089Scgd dec SC 1940b9f5089Scgd 1950b9f5089Scgd Lnot_too_big: 1960b9f5089Scgd 3: cmp V, R 1970b9f5089Scgd blu 2b 1980b9f5089Scgd nop 1990b9f5089Scgd be Ldo_single_div 2000b9f5089Scgd nop 2010b9f5089Scgd /* NB: these are commented out in the V8-Sparc manual as well */ 2020b9f5089Scgd /* (I do not understand this) */ 2030b9f5089Scgd ! V > R: went too far: back up 1 step 2040b9f5089Scgd ! srl V, 1, V 2050b9f5089Scgd ! dec SC 2060b9f5089Scgd ! do single-bit divide steps 2070b9f5089Scgd ! 2080b9f5089Scgd ! We have to be careful here. We know that R >= V, so we can do the 2090b9f5089Scgd ! first divide step without thinking. BUT, the others are conditional, 2100b9f5089Scgd ! and are only done if R >= 0. Because both R and V may have the high- 2110b9f5089Scgd ! order bit set in the first step, just falling into the regular 2120b9f5089Scgd ! division loop will mess up the first time around. 2130b9f5089Scgd ! So we unroll slightly... 2140b9f5089Scgd Ldo_single_div: 2150b9f5089Scgd deccc SC 2160b9f5089Scgd bl Lend_regular_divide 2170b9f5089Scgd nop 2180b9f5089Scgd sub R, V, R 2190b9f5089Scgd mov 1, Q 2200b9f5089Scgd b Lend_single_divloop 2210b9f5089Scgd nop 2220b9f5089Scgd Lsingle_divloop: 2230b9f5089Scgd sll Q, 1, Q 2240b9f5089Scgd bl 1f 2250b9f5089Scgd srl V, 1, V 2260b9f5089Scgd ! R >= 0 2270b9f5089Scgd sub R, V, R 2280b9f5089Scgd b 2f 2290b9f5089Scgd inc Q 2300b9f5089Scgd 1: ! R < 0 2310b9f5089Scgd add R, V, R 2320b9f5089Scgd dec Q 2330b9f5089Scgd 2: 2340b9f5089Scgd Lend_single_divloop: 2350b9f5089Scgd deccc SC 2360b9f5089Scgd bge Lsingle_divloop 2370b9f5089Scgd tst R 2380b9f5089Scgd b,a Lend_regular_divide 2390b9f5089Scgd 2400b9f5089ScgdLnot_really_big: 2410b9f5089Scgd1: 2420b9f5089Scgd sll V, N, V 2430b9f5089Scgd cmp V, R 2440b9f5089Scgd bleu 1b 2450b9f5089Scgd inccc ITER 2460b9f5089Scgd be Lgot_result 2470b9f5089Scgd dec ITER 2480b9f5089Scgd 2490b9f5089Scgd tst R ! set up for initial iteration 2500b9f5089ScgdLdivloop: 2510b9f5089Scgd sll Q, N, Q 2520b9f5089Scgd DEVELOP_QUOTIENT_BITS(1, 0) 2530b9f5089ScgdLend_regular_divide: 2540b9f5089Scgd deccc ITER 2550b9f5089Scgd bge Ldivloop 2560b9f5089Scgd tst R 2570b9f5089Scgd bl,a Lgot_result 2580b9f5089Scgd ! non-restoring fixup here (one instruction only!) 2590b9f5089Scgdifelse(OP, `div', 2600b9f5089Scgd` dec Q 2610b9f5089Scgd', ` add R, divisor, R 2620b9f5089Scgd') 2630b9f5089Scgd 2640b9f5089ScgdLgot_result: 2650b9f5089Scgdifelse(S, `true', 2660b9f5089Scgd` ! check to see if answer should be < 0 2670b9f5089Scgd tst SIGN 2680b9f5089Scgd bl,a 1f 2690b9f5089Scgd ifelse(OP, `div', `neg Q', `neg R') 2700b9f5089Scgd1:') 2710b9f5089Scgd retl 2720b9f5089Scgd ifelse(OP, `div', `mov Q, %o0', `mov R, %o0') 273