xref: /plan9/sys/src/cmd/gs/src/iutilasm.asm (revision 593dc095aefb2a85c828727bbfa9da139a49bdf4)
1;    Copyright (C) 1989, 1992, 1993 Aladdin Enterprises.  All rights reserved.
2;
3; This software is provided AS-IS with no warranty, either express or
4; implied.
5;
6; This software is distributed under license and may not be copied,
7; modified or distributed except as expressly authorized under the terms
8; of the license contained in the file LICENSE in this distribution.
9;
10; For more information about licensing, please refer to
11; http://www.ghostscript.com/licensing/. For information on
12; commercial licensing, go to http://www.artifex.com/licensing/ or
13; contact Artifex Software, Inc., 101 Lucas Valley Road #110,
14; San Rafael, CA  94903, U.S.A., +1(415)492-9861.
15
16; $Id: iutilasm.asm,v 1.4 2002/02/21 22:24:53 giles Exp $
17; iutilasm.asm
18; Assembly code for Ghostscript interpreter on MS-DOS systems
19
20	ifdef	FOR80386
21
22	.286c
23
24	endif
25
26utilasm_TEXT	SEGMENT	WORD PUBLIC 'CODE'
27	ASSUME	CS:utilasm_TEXT
28
29
30	ifdef	FOR80386
31
32; Macro for 32-bit operand prefix.
33OP32	macro
34	db	66h
35	endm
36
37	endif					; FOR80386
38
39; Clear a register
40
41clear	macro	reg
42	xor	reg,reg
43	endm
44
45
46	ifdef	FOR80386
47
48; Replace the multiply and divide routines in the Turbo C library
49; if we are running on an 80386.
50
51; Macro to swap the halves of a 32-bit register.
52; Unfortunately, masm won't allow a shift instruction with a count of 16,
53; so we have to code it in hex.
54swap	macro	regno
55	  OP32
56	db	0c1h,0c0h+regno,16		; rol regno,16
57	endm
58regax	equ	0
59regcx	equ	1
60regdx	equ	2
61regbx	equ	3
62
63
64; Multiply (dx,ax) by (cx,bx) to (dx,ax).
65	PUBLIC	LXMUL@
66	PUBLIC	F_LXMUL@
67F_LXMUL@ proc	far
68LXMUL@	proc	far
69	swap	regdx
70	mov	dx,ax
71	swap	regcx
72	mov	cx,bx
73	  OP32
74	db	0fh,0afh,0d1h			; imul dx,cx
75	  OP32
76	mov	ax,dx
77	swap	regdx
78	ret
79LXMUL@	endp
80F_LXMUL@ endp
81
82
83; Divide two stack operands, leave the result in (dx,ax).
84
85	ifdef	DEBUG
86
87setup32	macro
88	mov	bx,sp
89	push	bp
90	mov	bp,sp
91	  OP32
92	mov	ax,ss:[bx+4]			; dividend
93	endm
94
95ret32	macro	n
96	mov	sp,bp
97	pop	bp
98	ret	n
99	endm
100
101	else					; !DEBUG
102
103setup32	macro
104	mov	bx,sp
105	  OP32
106	mov	ax,ss:[bx+4]			; dividend
107	endm
108
109ret32	macro	n
110	ret	n
111	endm
112
113	endif					; (!)DEBUG
114
115	PUBLIC	LDIV@, LUDIV@, LMOD@, LUMOD@
116	PUBLIC	F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@
117F_LDIV@	proc	far
118LDIV@	proc	far
119	setup32
120	  OP32
121	cwd
122	  OP32
123	idiv	word ptr ss:[bx+8]		; divisor
124	  OP32
125	mov	dx,ax
126	swap	regdx
127	ret32	8
128LDIV@	endp
129F_LDIV@	endp
130F_LUDIV@ proc	far
131LUDIV@	proc	far
132	setup32
133	  OP32
134	xor	dx,dx
135	  OP32
136	div	word ptr ss:[bx+8]		; divisor
137	  OP32
138	mov	dx,ax
139	swap	regdx
140	ret32	8
141LUDIV@	endp
142F_LUDIV@ endp
143F_LMOD@	proc	far
144LMOD@	proc	far
145	setup32
146	  OP32
147	cwd
148	  OP32
149	idiv	word ptr ss:[bx+8]		; divisor
150	  OP32
151	mov	ax,dx
152	swap	regdx
153	ret32	8
154LMOD@	endp
155F_LMOD@	endp
156F_LUMOD@ proc	far
157LUMOD@	proc	far
158	setup32
159	  OP32
160	xor	dx,dx
161	  OP32
162	div	word ptr ss:[bx+8]		; divisor
163	  OP32
164	mov	ax,dx
165	swap	regdx
166	ret32	8
167LUMOD@	endp
168F_LUMOD@ endp
169
170	else					; !FOR80386
171
172; Replace the divide routines in the Turbo C library,
173; which do the division one bit at a time (!).
174
175	PUBLIC	LDIV@, LMOD@, LUDIV@, LUMOD@
176	PUBLIC	F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@
177
178; Negate a long on the stack.
179negbp	macro	offset
180	neg	word ptr [bp+offset+2]		; high part
181	neg	word ptr [bp+offset]		; low part
182	sbb	word ptr [bp+offset+2],0
183	endm
184
185; Negate a long in (dx,ax).
186negr	macro
187	neg	dx
188	neg	ax
189	sbb	dx,0
190	endm
191
192; Divide two unsigned longs on the stack.
193; Leave either the quotient or the remainder in (dx,ax).
194; Operand offsets assume that bp (and only bp) has been pushed.
195nlo	equ	6
196nhi	equ	8
197dlo	equ	10
198dhi	equ	12
199
200; We use an offset in bx to distinguish div from mod,
201; and to indicate whether the result should be negated.
202odiv	equ	0
203omod	equ	2
204odivneg	equ	4
205omodneg	equ	6
206F_LMOD@	proc	far
207LMOD@	proc	far
208	push	bp
209	mov	bp,sp
210	mov	bx,omod
211			; Take abs of denominator
212	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
213	jge	modpd
214	negbp	dlo
215modpd:			; Negate mod if numerator < 0
216	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
217	jge	udiv
218	mov	bx,omodneg
219negnum:	negbp	nlo
220	jmp	udiv
221LMOD@	endp
222F_LMOD@	endp
223F_LUMOD@ proc	far
224LUMOD@	proc	far
225	mov	bx,omod
226	jmp	udpush
227LUMOD@	endp
228F_LUMOD@ endp
229F_LDIV@	proc	far
230LDIV@	proc	far
231	push	bp
232	mov	bp,sp
233	mov	bx,odiv
234			; Negate quo if num^den < 0
235	mov	ax,[bp+nhi]
236	xor	ax,[bp+dhi]
237	jge	divabs
238	mov	bx,odivneg
239divabs:			; Take abs of denominator
240	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
241	jge	divpd
242	negbp	dlo
243divpd:			; Take abs of numerator
244	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
245	jge	udiv
246	jmp	negnum
247LDIV@	endp
248F_LDIV@	endp
249F_LUDIV@ proc	far
250LUDIV@	proc	far
251	mov	bx,odiv
252udpush:	push	bp
253	mov	bp,sp
254udiv:	push	bx				; odiv, omod, odivneg, omodneg
255	mov	ax,[bp+nlo]
256	mov	dx,[bp+nhi]
257	mov	bx,[bp+dlo]
258	mov	cx,[bp+dhi]
259; Now we are dividing dx:ax by cx:bx.
260; Check to see whether this is really a 32/16 division.
261	or	cx,cx
262	jnz	div2
263; 32/16, check for 16- vs. 32-bit quotient
264	cmp	dx,bx
265	jae	div1
266; 32/16 with 16-bit quotient, just do it.
267	div	bx				; ax = quo, dx = rem
268	pop	bx
269	pop	bp
270	jmp	cs:xx1[bx]
271	even
272xx1	dw	offset divx1
273	dw	offset modx1
274	dw	offset divx1neg
275	dw	offset modx1neg
276modx1:	mov	ax,dx
277divx1:	xor	dx,dx
278	ret	8
279modx1neg: mov	ax,dx
280divx1neg: xor	dx,dx
281rneg:	negr
282	ret	8
283; 32/16 with 32-bit quotient, do in 2 parts.
284div1:	mov	cx,ax				; save lo num
285	mov	ax,dx
286	xor	dx,dx
287	div	bx				; ax = hi quo
288	xchg	cx,ax				; save hi quo, get lo num
289	div	bx				; ax = lo quo, dx = rem
290	pop	bx
291	pop	bp
292	jmp	cs:xx1a[bx]
293	even
294xx1a	dw	offset divx1a
295	dw	offset modx1
296	dw	offset divx1aneg
297	dw	offset modx1neg
298divx1a:	mov	dx,cx				; hi quo
299	ret	8
300divx1aneg: mov	dx,cx
301	jmp	rneg
302; This is really a 32/32 bit division.
303; (Note that the quotient cannot exceed 16 bits.)
304; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2
305; (first edition).
306; Start by normalizing the numerator and denominator.
307div2:	or	ch,ch
308	jz	div21				; ch == 0, but cl != 0
309; Do 8 steps all at once.
310	mov	bl,bh
311	mov	bh,cl
312	mov	cl,ch
313	xor	ch,ch
314	mov	al,ah
315	mov	ah,dl
316	mov	dl,dh
317	xor	dh,dh
318	rol	bx,1				; faster than jmp
319div2a:	rcr	bx,1				; finish previous shift
320div21:	shr	dx,1
321	rcr	ax,1
322	shr	cx,1
323	jnz	div2a
324	rcr	bx,1
325; Now we can do a 32/16 divide.
326div2x:	div	bx				; ax = quo, dx = rem
327; Multiply by the denominator, and correct the result.
328	mov	cx,ax				; save quotient
329	mul	word ptr [bp+dhi]
330	mov	bx,ax				; save lo part of hi product
331	mov	ax,cx
332	mul	word ptr [bp+dlo]
333	add	dx,bx
334; Now cx = trial quotient, (dx,ax) = cx * denominator.
335	not	dx
336	neg	ax
337	cmc
338	adc	dx,0				; double-precision neg
339	jc	divz				; zero quotient
340						; requires special handling
341	add	ax,[bp+nlo]
342	adc	dx,[bp+nhi]
343	jc	divx
344; Quotient is too large, adjust it.
345div3:	dec	cx
346	add	ax,[bp+dlo]
347	adc	dx,[bp+dhi]
348	jnc	div3
349; All done.  (dx,ax) = remainder, cx = lo quotient.
350divx:	pop	bx
351	pop	bp
352	jmp	cs:xx3[bx]
353	even
354xx3	dw	offset divx3
355	dw	offset modx3
356	dw	offset divx3neg
357	dw	offset modx3neg
358divx3:	mov	ax,cx
359	xor	dx,dx
360modx3:	ret	8
361divx3neg: mov	ax,cx
362	xor	dx,dx
363modx3neg: jmp	rneg
364; Handle zero quotient specially.
365divz:	pop	bx
366	jmp	cs:xxz[bx]
367	even
368xxz	dw	offset divxz
369	dw	offset modxz
370	dw	offset divxz
371	dw	offset modxzneg
372divxz:	pop	bp
373	ret	8
374modxzneg: negbp	nlo
375modxz:	mov	ax,[bp+nlo]
376	mov	dx,[bp+nhi]
377	pop	bp
378	ret	8
379LUDIV@	endp
380F_LUDIV@ endp
381
382	endif					; FOR80386
383
384
385	ifdef	NOFPU
386
387; See gsmisc.c for the C version of this code.
388
389; /*
390;  * Floating multiply with fixed result, for avoiding floating point in
391;  * common coordinate transformations.  Assumes IEEE representation,
392;  * 16-bit short, 32-bit long.  Optimized for the case where the first
393;  * operand has no more than 16 mantissa bits, e.g., where it is a user space
394;  * coordinate (which are often integers).
395;  *
396;  * The assembly language version of this code is actually faster than
397;  * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking
398;  * a trap on every FPU operation).  If there is no FPU, the assembly
399;  * language version of this code is over 10 times as fast as the
400;  * emulated FPU.
401;  */
402; fixed
403; fmul2fixed_(long /*float*/ a, long /*float*/ b)
404; {
405
406	PUBLIC	_fmul2fixed_
407_fmul2fixed_ proc far
408	push	bp
409	mov	bp,sp
410a	equ	6
411alo	equ	a
412ahi	equ	a+2
413b	equ	10
414blo	equ	b
415bhi	equ	b+2
416	push	si		; will hold ma
417	push	di		; will hold mb
418
419; 	int e = 260 + _fixed_shift - ((
420; 		(((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80)
421; 	  ) >> 7);
422
423	mov	dx,[bp+ahi]
424; dfmul2fixed enters here
425fmf:	mov	cx,260+12
426	mov	ax,[bp+bhi]
427	and	ax,7f80h
428	and	dx,7f80h
429	add	ax,dx
430	xchg	ah,al		; ror ax,7 without using cl
431	rol	ax,1
432	sub	cx,ax
433	push	cx		; e
434
435; 	ulong ma = (ushort)(a >> 8) | 0x8000;
436; 	ulong mb = (ushort)(b >> 8) | 0x8000;
437
438	mov	si,[bp+alo+1]	; unaligned
439	clear	ax
440	mov	di,[bp+blo+1]	; unaligned
441	or	si,8000h
442	or	di,8000h
443
444; 	ulong p1 = ma * (b & 0xff);
445
446	mov	al,[bp+blo]
447	mul	si
448
449;			(Do this later:)
450; 	ulong p = ma * mb;
451
452; 	if ( (byte)a )		/* >16 mantissa bits */
453
454	cmp	byte ptr [bp+alo],0
455	je	mshort
456
457; 	{	ulong p2 = (a & 0xff) * mb;
458; 		p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8;
459
460	mov	cx,dx
461	mov	bx,ax
462	clear	ax
463	mov	al,[bp+alo]
464	clear	dx
465	mov	dl,[bp+blo]
466	mul	dx
467	mov	dl,ah		; dx is zero
468	add	bx,cx
469	adc	cx,0
470	clear	ax
471	mov	al,[bp+alo]
472	mul	di
473	add	ax,bx
474	adc	dx,cx
475
476; 	}
477
478mshort:
479
480; 	else
481; 		p += p1 >> 8;
482
483	mov	bl,ah		; set (cx,bx) = (dx,ax) >> 8
484	mov	bh,dl
485	clear	cx
486	mov	cl,dh
487	mov	ax,si
488	mul	di
489	add	ax,bx
490	adc	dx,cx
491
492; 	if ( (uint)e < 32 )		/* e = -1 is possible */
493
494	pop	cx		; e
495	cmp	cx,16
496	jb	shr1
497
498; 	else if ( e >= 32 )		/* also detects a=0 or b=0 */
499
500	cmp	cx,0
501	jl	eneg
502	sub	cx,16
503	cmp	cx,16
504	jge	shr0
505	mov	ax,dx
506	clear	dx
507	shr	ax,cl
508	jmp	ex
509
510; 		return fixed_0;
511
512shr0:	clear	ax
513	clear	dx
514	jmp	ex
515
516; 	else
517; 		p <<= -e;
518
519	even
520eneg:	neg	cx
521	shl	dx,cl
522	mov	bx,ax
523	shl	ax,cl
524	rol	bx,cl
525	xor	bx,ax
526	add	dx,bx
527	jmp	ex
528
529; 		p >>= e;
530
531	even
532shr1:	shr	ax,cl
533	mov	bx,dx
534	shr	dx,cl
535	ror	bx,cl
536	xor	bx,dx
537	add	ax,bx
538
539ex:
540
541; 	return ((a ^ b) < 0 ? -p : p);
542
543	mov	cx,[bp+ahi]
544	xor	cx,[bp+bhi]
545	jge	pos
546	neg	dx
547	neg	ax
548	sbb	dx,0
549pos:
550
551; }
552
553retu:	pop	di
554	pop	si
555	mov	sp,bp
556	pop	bp
557	ret
558
559_fmul2fixed_ ENDP
560
561; The same routine with the first argument a double rather than a float.
562; The argument is split into two pieces to reduce data movement.
563
564	PUBLIC	_dfmul2fixed_
565_dfmul2fixed_ proc far
566	push	bp
567	mov	bp,sp
568xalo	equ	6
569;b	equ	10
570xahi	equ	14
571	push	si		; overlap this below
572	push	di		; ditto
573
574; Shuffle the arguments and then use fmul2fixed.
575
576; Squeeze 3 exponent bits out of the top 35 bits of a.
577
578	mov	dx,[bp+xahi+2]
579	mov	bx,0c000h
580	mov	ax,[bp+xahi]
581	and	bx,dx
582	mov	cx,[bp+xalo+2]
583	and	dx,7ffh		; get rid of discarded bits
584	add	cx,cx		; faster than shl!
585	jz	cz		; detect common case
586	adc	ax,ax		; faster than rcl!
587	adc	dx,dx
588	add	cx,cx
589	adc	ax,ax
590	adc	dx,dx
591	add	cx,cx
592	adc	ax,ax
593	mov	[bp+alo],ax
594	adc	dx,dx
595	or	dx,bx
596	mov	[bp+ahi],dx
597	jmp	fmf
598	even
599cz:	adc	ax,ax
600	adc	dx,dx
601	add	ax,ax
602	adc	dx,dx
603	add	ax,ax
604	mov	[bp+alo],ax
605	adc	dx,dx
606	or	dx,bx
607	mov	[bp+ahi],dx
608	jmp	fmf
609
610_dfmul2fixed_ ENDP
611
612	endif					; NOFPU
613
614
615; Transpose an 8x8 bit matrix.  See gsmisc.c for the algorithm in C.
616	PUBLIC	_memflip8x8
617_memflip8x8 proc far
618	push	ds
619	push	si
620	push	di
621		; After pushing, the offsets of the parameters are:
622		; byte *inp=10, int line_size=14, byte *outp=16, int dist=20.
623	mov	si,sp
624	mov	di,ss:[si+14]			; line_size
625	lds	si,ss:[si+10]			; inp
626		; We assign variables to registers as follows:
627		; ax = AE, bx = BF, cx (or di) = CG, dx = DH.
628		; Load the input data.  Initially we assign
629		; ax = AB, bx = EF, cx (or di) = CD, dx = GH.
630	mov	ah,[si]
631iload	macro	reg
632	add	si,di
633	mov	reg,[si]
634	endm
635	iload	al
636	iload	ch
637	iload	cl
638	iload	bh
639	iload	bl
640	iload	dh
641	iload	dl
642		; Transposition macro, see C code for explanation.
643trans	macro	reg1,reg2,shift,mask
644	mov	si,reg1
645	shr	si,shift
646	xor	si,reg2
647	and	si,mask
648	xor	reg2,si
649	shl	si,shift
650	xor	reg1,si
651	endm
652		; Do 4x4 transpositions
653	mov	di,cx			; we need cl for the shift count
654	mov	cl,4
655	trans	bx,ax,cl,0f0fh
656	trans	dx,di,cl,0f0fh
657		; Swap B/E, D/G
658	xchg	al,bh
659	mov	cx,di
660	xchg	cl,dh
661		; Do 2x2 transpositions
662	mov	di,cx				; need cl again
663	mov	cl,2
664	trans	di,ax,cl,3333h
665	trans	dx,bx,cl,3333h
666	mov	cx,di				; done shifting >1
667		; Do 1x1 transpositions
668	trans	bx,ax,1,5555h
669	trans	dx,cx,1,5555h
670		; Store result
671	mov	si,sp
672	mov	di,ss:[si+20]			; dist
673	lds	si,ss:[si+16]			; outp
674	mov	[si],ah
675istore	macro	reg
676	add	si,di
677	mov	[si],reg
678	endm
679	istore	bh
680	istore	ch
681	istore	dh
682	istore	al
683	istore	bl
684	istore	cl
685	istore	dl
686		; All done
687	pop	di
688	pop	si
689	pop	ds
690	ret
691_memflip8x8 ENDP
692
693
694utilasm_TEXT ENDS
695	END
696