xref: /netbsd-src/lib/libc/arch/sparc/gen/divrem.m4 (revision d2c615416b27528414191875ef85f51c10b3d2b6)
10b9f5089Scgd/*
20b9f5089Scgd * Copyright (c) 1992, 1993
30b9f5089Scgd *	The Regents of the University of California.  All rights reserved.
40b9f5089Scgd *
50b9f5089Scgd * This software was developed by the Computer Systems Engineering group
60b9f5089Scgd * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
70b9f5089Scgd * contributed to Berkeley.
80b9f5089Scgd *
90b9f5089Scgd * Redistribution and use in source and binary forms, with or without
100b9f5089Scgd * modification, are permitted provided that the following conditions
110b9f5089Scgd * are met:
120b9f5089Scgd * 1. Redistributions of source code must retain the above copyright
130b9f5089Scgd *    notice, this list of conditions and the following disclaimer.
140b9f5089Scgd * 2. Redistributions in binary form must reproduce the above copyright
150b9f5089Scgd *    notice, this list of conditions and the following disclaimer in the
160b9f5089Scgd *    documentation and/or other materials provided with the distribution.
17eb7c1594Sagc * 3. Neither the name of the University nor the names of its contributors
180b9f5089Scgd *    may be used to endorse or promote products derived from this software
190b9f5089Scgd *    without specific prior written permission.
200b9f5089Scgd *
210b9f5089Scgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
220b9f5089Scgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
230b9f5089Scgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
240b9f5089Scgd * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
250b9f5089Scgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
260b9f5089Scgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
270b9f5089Scgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
280b9f5089Scgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
290b9f5089Scgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
300b9f5089Scgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
310b9f5089Scgd * SUCH DAMAGE.
320b9f5089Scgd *
330b9f5089Scgd * from: Header: divrem.m4,v 1.4 92/06/25 13:23:57 torek Exp
34*d2c61541Smartin * $NetBSD: divrem.m4,v 1.6 2011/03/23 20:54:35 martin Exp $
350b9f5089Scgd */
360b9f5089Scgd
370b9f5089Scgd/*
380b9f5089Scgd * Division and remainder, from Appendix E of the Sparc Version 8
390b9f5089Scgd * Architecture Manual, with fixes from Gordon Irlam.
400b9f5089Scgd */
410b9f5089Scgd
420b9f5089Scgd#if defined(LIBC_SCCS) && !defined(lint)
430b9f5089Scgd	.asciz "@(#)divrem.m4	8.1 (Berkeley) 6/4/93"
440b9f5089Scgd#endif /* LIBC_SCCS and not lint */
450b9f5089Scgd
460b9f5089Scgd/*
470b9f5089Scgd * Input: dividend and divisor in %o0 and %o1 respectively.
480b9f5089Scgd *
490b9f5089Scgd * m4 parameters:
500b9f5089Scgd *  NAME	name of function to generate
510b9f5089Scgd *  OP		OP=div => %o0 / %o1; OP=rem => %o0 % %o1
520b9f5089Scgd *  S		S=true => signed; S=false => unsigned
530b9f5089Scgd *
540b9f5089Scgd * Algorithm parameters:
550b9f5089Scgd *  N		how many bits per iteration we try to get (4)
560b9f5089Scgd *  WORDSIZE	total number of bits (32)
570b9f5089Scgd *
580b9f5089Scgd * Derived constants:
590b9f5089Scgd *  TWOSUPN	2^N, for label generation (m4 exponentiation currently broken)
600b9f5089Scgd *  TOPBITS	number of bits in the top `decade' of a number
610b9f5089Scgd *
620b9f5089Scgd * Important variables:
630b9f5089Scgd *  Q		the partial quotient under development (initially 0)
640b9f5089Scgd *  R		the remainder so far, initially the dividend
650b9f5089Scgd *  ITER	number of main division loop iterations required;
660b9f5089Scgd *		equal to ceil(log2(quotient) / N).  Note that this
670b9f5089Scgd *		is the log base (2^N) of the quotient.
680b9f5089Scgd *  V		the current comparand, initially divisor*2^(ITER*N-1)
690b9f5089Scgd *
700b9f5089Scgd * Cost:
710b9f5089Scgd *  Current estimate for non-large dividend is
720b9f5089Scgd *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
730b9f5089Scgd *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
740b9f5089Scgd *  different path, as the upper bits of the quotient must be developed
750b9f5089Scgd *  one bit at a time.
760b9f5089Scgd */
770b9f5089Scgd
780b9f5089Scgddefine(N, `4')
790b9f5089Scgddefine(TWOSUPN, `16')
800b9f5089Scgddefine(WORDSIZE, `32')
810b9f5089Scgddefine(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))
820b9f5089Scgd
830b9f5089Scgddefine(dividend, `%o0')
840b9f5089Scgddefine(divisor, `%o1')
850b9f5089Scgddefine(Q, `%o2')
860b9f5089Scgddefine(R, `%o3')
870b9f5089Scgddefine(ITER, `%o4')
880b9f5089Scgddefine(V, `%o5')
890b9f5089Scgd
900b9f5089Scgd/* m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d */
910b9f5089Scgddefine(T, `%g1')
92*d2c61541Smartindefine(SC, `%g5')
930b9f5089Scgdifelse(S, `true', `define(SIGN, `%g6')')
940b9f5089Scgd
950b9f5089Scgd/*
960b9f5089Scgd * This is the recursive definition for developing quotient digits.
970b9f5089Scgd *
980b9f5089Scgd * Parameters:
990b9f5089Scgd *  $1	the current depth, 1 <= $1 <= N
1000b9f5089Scgd *  $2	the current accumulation of quotient bits
1010b9f5089Scgd *  N	max depth
1020b9f5089Scgd *
1030b9f5089Scgd * We add a new bit to $2 and either recurse or insert the bits in
1040b9f5089Scgd * the quotient.  R, Q, and V are inputs and outputs as defined above;
1050b9f5089Scgd * the condition codes are expected to reflect the input R, and are
1060b9f5089Scgd * modified to reflect the output R.
1070b9f5089Scgd */
1080b9f5089Scgddefine(DEVELOP_QUOTIENT_BITS,
1090b9f5089Scgd`	! depth $1, accumulated bits $2
1100b9f5089Scgd	bl	L.$1.eval(TWOSUPN+$2)
1110b9f5089Scgd	srl	V,1,V
1120b9f5089Scgd	! remainder is positive
1130b9f5089Scgd	subcc	R,V,R
1140b9f5089Scgd	ifelse($1, N,
1150b9f5089Scgd	`	b	9f
1160b9f5089Scgd		add	Q, ($2*2+1), Q
1170b9f5089Scgd	', `	DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')')
1180b9f5089ScgdL.$1.eval(TWOSUPN+$2):
1190b9f5089Scgd	! remainder is negative
1200b9f5089Scgd	addcc	R,V,R
1210b9f5089Scgd	ifelse($1, N,
1220b9f5089Scgd	`	b	9f
1230b9f5089Scgd		add	Q, ($2*2-1), Q
1240b9f5089Scgd	', `	DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')')
1250b9f5089Scgd	ifelse($1, 1, `9:')')
1260b9f5089Scgd
127368d2cb7Smrg#include <machine/asm.h>
1280b9f5089Scgd#include <machine/trap.h>
1290b9f5089Scgd
1300b9f5089ScgdFUNC(NAME)
1310b9f5089Scgdifelse(S, `true',
1320b9f5089Scgd`	! compute sign of result; if neither is negative, no problem
1330b9f5089Scgd	orcc	divisor, dividend, %g0	! either negative?
1340b9f5089Scgd	bge	2f			! no, go do the divide
135a3fa6016Spk	ifelse(OP, `div',
136a3fa6016Spk		`xor	divisor, dividend, SIGN',
137a3fa6016Spk		`mov	dividend, SIGN')	! compute sign in any case
1380b9f5089Scgd	tst	divisor
1390b9f5089Scgd	bge	1f
1400b9f5089Scgd	tst	dividend
1410b9f5089Scgd	! divisor is definitely negative; dividend might also be negative
1420b9f5089Scgd	bge	2f			! if dividend not negative...
1430b9f5089Scgd	neg	divisor			! in any case, make divisor nonneg
1440b9f5089Scgd1:	! dividend is negative, divisor is nonnegative
1450b9f5089Scgd	neg	dividend		! make dividend nonnegative
1460b9f5089Scgd2:
1470b9f5089Scgd')
1480b9f5089Scgd	! Ready to divide.  Compute size of quotient; scale comparand.
1490b9f5089Scgd	orcc	divisor, %g0, V
1500b9f5089Scgd	bnz	1f
1510b9f5089Scgd	mov	dividend, R
1520b9f5089Scgd
1530b9f5089Scgd		! Divide by zero trap.  If it returns, return 0 (about as
1540b9f5089Scgd		! wrong as possible, but that is what SunOS does...).
1550b9f5089Scgd		t	ST_DIV0
1560b9f5089Scgd		retl
1570b9f5089Scgd		clr	%o0
1580b9f5089Scgd
1590b9f5089Scgd1:
1600b9f5089Scgd	cmp	R, V			! if divisor exceeds dividend, done
1610b9f5089Scgd	blu	Lgot_result		! (and algorithm fails otherwise)
1620b9f5089Scgd	clr	Q
1630b9f5089Scgd	sethi	%hi(1 << (WORDSIZE - TOPBITS - 1)), T
1640b9f5089Scgd	cmp	R, T
1650b9f5089Scgd	blu	Lnot_really_big
1660b9f5089Scgd	clr	ITER
1670b9f5089Scgd
1680b9f5089Scgd	! `Here the dividend is >= 2^(31-N) or so.  We must be careful here,
1690b9f5089Scgd	! as our usual N-at-a-shot divide step will cause overflow and havoc.
1700b9f5089Scgd	! The number of bits in the result here is N*ITER+SC, where SC <= N.
1710b9f5089Scgd	! Compute ITER in an unorthodox manner: know we need to shift V into
1720b9f5089Scgd	! the top decade: so do not even bother to compare to R.'
1730b9f5089Scgd	1:
1740b9f5089Scgd		cmp	V, T
1750b9f5089Scgd		bgeu	3f
1760b9f5089Scgd		mov	1, SC
1770b9f5089Scgd		sll	V, N, V
1780b9f5089Scgd		b	1b
1790b9f5089Scgd		inc	ITER
1800b9f5089Scgd
1810b9f5089Scgd	! Now compute SC.
1820b9f5089Scgd	2:	addcc	V, V, V
1830b9f5089Scgd		bcc	Lnot_too_big
1840b9f5089Scgd		inc	SC
1850b9f5089Scgd
1860b9f5089Scgd		! We get here if the divisor overflowed while shifting.
1870b9f5089Scgd		! This means that R has the high-order bit set.
1880b9f5089Scgd		! Restore V and subtract from R.
1890b9f5089Scgd		sll	T, TOPBITS, T	! high order bit
1900b9f5089Scgd		srl	V, 1, V		! rest of V
1910b9f5089Scgd		add	V, T, V
1920b9f5089Scgd		b	Ldo_single_div
1930b9f5089Scgd		dec	SC
1940b9f5089Scgd
1950b9f5089Scgd	Lnot_too_big:
1960b9f5089Scgd	3:	cmp	V, R
1970b9f5089Scgd		blu	2b
1980b9f5089Scgd		nop
1990b9f5089Scgd		be	Ldo_single_div
2000b9f5089Scgd		nop
2010b9f5089Scgd	/* NB: these are commented out in the V8-Sparc manual as well */
2020b9f5089Scgd	/* (I do not understand this) */
2030b9f5089Scgd	! V > R: went too far: back up 1 step
2040b9f5089Scgd	!	srl	V, 1, V
2050b9f5089Scgd	!	dec	SC
2060b9f5089Scgd	! do single-bit divide steps
2070b9f5089Scgd	!
2080b9f5089Scgd	! We have to be careful here.  We know that R >= V, so we can do the
2090b9f5089Scgd	! first divide step without thinking.  BUT, the others are conditional,
2100b9f5089Scgd	! and are only done if R >= 0.  Because both R and V may have the high-
2110b9f5089Scgd	! order bit set in the first step, just falling into the regular
2120b9f5089Scgd	! division loop will mess up the first time around.
2130b9f5089Scgd	! So we unroll slightly...
2140b9f5089Scgd	Ldo_single_div:
2150b9f5089Scgd		deccc	SC
2160b9f5089Scgd		bl	Lend_regular_divide
2170b9f5089Scgd		nop
2180b9f5089Scgd		sub	R, V, R
2190b9f5089Scgd		mov	1, Q
2200b9f5089Scgd		b	Lend_single_divloop
2210b9f5089Scgd		nop
2220b9f5089Scgd	Lsingle_divloop:
2230b9f5089Scgd		sll	Q, 1, Q
2240b9f5089Scgd		bl	1f
2250b9f5089Scgd		srl	V, 1, V
2260b9f5089Scgd		! R >= 0
2270b9f5089Scgd		sub	R, V, R
2280b9f5089Scgd		b	2f
2290b9f5089Scgd		inc	Q
2300b9f5089Scgd	1:	! R < 0
2310b9f5089Scgd		add	R, V, R
2320b9f5089Scgd		dec	Q
2330b9f5089Scgd	2:
2340b9f5089Scgd	Lend_single_divloop:
2350b9f5089Scgd		deccc	SC
2360b9f5089Scgd		bge	Lsingle_divloop
2370b9f5089Scgd		tst	R
2380b9f5089Scgd		b,a	Lend_regular_divide
2390b9f5089Scgd
2400b9f5089ScgdLnot_really_big:
2410b9f5089Scgd1:
2420b9f5089Scgd	sll	V, N, V
2430b9f5089Scgd	cmp	V, R
2440b9f5089Scgd	bleu	1b
2450b9f5089Scgd	inccc	ITER
2460b9f5089Scgd	be	Lgot_result
2470b9f5089Scgd	dec	ITER
2480b9f5089Scgd
2490b9f5089Scgd	tst	R	! set up for initial iteration
2500b9f5089ScgdLdivloop:
2510b9f5089Scgd	sll	Q, N, Q
2520b9f5089Scgd	DEVELOP_QUOTIENT_BITS(1, 0)
2530b9f5089ScgdLend_regular_divide:
2540b9f5089Scgd	deccc	ITER
2550b9f5089Scgd	bge	Ldivloop
2560b9f5089Scgd	tst	R
2570b9f5089Scgd	bl,a	Lgot_result
2580b9f5089Scgd	! non-restoring fixup here (one instruction only!)
2590b9f5089Scgdifelse(OP, `div',
2600b9f5089Scgd`	dec	Q
2610b9f5089Scgd', `	add	R, divisor, R
2620b9f5089Scgd')
2630b9f5089Scgd
2640b9f5089ScgdLgot_result:
2650b9f5089Scgdifelse(S, `true',
2660b9f5089Scgd`	! check to see if answer should be < 0
2670b9f5089Scgd	tst	SIGN
2680b9f5089Scgd	bl,a	1f
2690b9f5089Scgd	ifelse(OP, `div', `neg Q', `neg R')
2700b9f5089Scgd1:')
2710b9f5089Scgd	retl
2720b9f5089Scgd	ifelse(OP, `div', `mov Q, %o0', `mov R, %o0')
273