1; Copyright (C) 1989, 1992, 1993 Aladdin Enterprises. All rights reserved. 2; 3; This software is provided AS-IS with no warranty, either express or 4; implied. 5; 6; This software is distributed under license and may not be copied, 7; modified or distributed except as expressly authorized under the terms 8; of the license contained in the file LICENSE in this distribution. 9; 10; For more information about licensing, please refer to 11; http://www.ghostscript.com/licensing/. For information on 12; commercial licensing, go to http://www.artifex.com/licensing/ or 13; contact Artifex Software, Inc., 101 Lucas Valley Road #110, 14; San Rafael, CA 94903, U.S.A., +1(415)492-9861. 15 16; $Id: iutilasm.asm,v 1.4 2002/02/21 22:24:53 giles Exp $ 17; iutilasm.asm 18; Assembly code for Ghostscript interpreter on MS-DOS systems 19 20 ifdef FOR80386 21 22 .286c 23 24 endif 25 26utilasm_TEXT SEGMENT WORD PUBLIC 'CODE' 27 ASSUME CS:utilasm_TEXT 28 29 30 ifdef FOR80386 31 32; Macro for 32-bit operand prefix. 33OP32 macro 34 db 66h 35 endm 36 37 endif ; FOR80386 38 39; Clear a register 40 41clear macro reg 42 xor reg,reg 43 endm 44 45 46 ifdef FOR80386 47 48; Replace the multiply and divide routines in the Turbo C library 49; if we are running on an 80386. 50 51; Macro to swap the halves of a 32-bit register. 52; Unfortunately, masm won't allow a shift instruction with a count of 16, 53; so we have to code it in hex. 54swap macro regno 55 OP32 56 db 0c1h,0c0h+regno,16 ; rol regno,16 57 endm 58regax equ 0 59regcx equ 1 60regdx equ 2 61regbx equ 3 62 63 64; Multiply (dx,ax) by (cx,bx) to (dx,ax). 65 PUBLIC LXMUL@ 66 PUBLIC F_LXMUL@ 67F_LXMUL@ proc far 68LXMUL@ proc far 69 swap regdx 70 mov dx,ax 71 swap regcx 72 mov cx,bx 73 OP32 74 db 0fh,0afh,0d1h ; imul dx,cx 75 OP32 76 mov ax,dx 77 swap regdx 78 ret 79LXMUL@ endp 80F_LXMUL@ endp 81 82 83; Divide two stack operands, leave the result in (dx,ax). 84 85 ifdef DEBUG 86 87setup32 macro 88 mov bx,sp 89 push bp 90 mov bp,sp 91 OP32 92 mov ax,ss:[bx+4] ; dividend 93 endm 94 95ret32 macro n 96 mov sp,bp 97 pop bp 98 ret n 99 endm 100 101 else ; !DEBUG 102 103setup32 macro 104 mov bx,sp 105 OP32 106 mov ax,ss:[bx+4] ; dividend 107 endm 108 109ret32 macro n 110 ret n 111 endm 112 113 endif ; (!)DEBUG 114 115 PUBLIC LDIV@, LUDIV@, LMOD@, LUMOD@ 116 PUBLIC F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@ 117F_LDIV@ proc far 118LDIV@ proc far 119 setup32 120 OP32 121 cwd 122 OP32 123 idiv word ptr ss:[bx+8] ; divisor 124 OP32 125 mov dx,ax 126 swap regdx 127 ret32 8 128LDIV@ endp 129F_LDIV@ endp 130F_LUDIV@ proc far 131LUDIV@ proc far 132 setup32 133 OP32 134 xor dx,dx 135 OP32 136 div word ptr ss:[bx+8] ; divisor 137 OP32 138 mov dx,ax 139 swap regdx 140 ret32 8 141LUDIV@ endp 142F_LUDIV@ endp 143F_LMOD@ proc far 144LMOD@ proc far 145 setup32 146 OP32 147 cwd 148 OP32 149 idiv word ptr ss:[bx+8] ; divisor 150 OP32 151 mov ax,dx 152 swap regdx 153 ret32 8 154LMOD@ endp 155F_LMOD@ endp 156F_LUMOD@ proc far 157LUMOD@ proc far 158 setup32 159 OP32 160 xor dx,dx 161 OP32 162 div word ptr ss:[bx+8] ; divisor 163 OP32 164 mov ax,dx 165 swap regdx 166 ret32 8 167LUMOD@ endp 168F_LUMOD@ endp 169 170 else ; !FOR80386 171 172; Replace the divide routines in the Turbo C library, 173; which do the division one bit at a time (!). 174 175 PUBLIC LDIV@, LMOD@, LUDIV@, LUMOD@ 176 PUBLIC F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@ 177 178; Negate a long on the stack. 179negbp macro offset 180 neg word ptr [bp+offset+2] ; high part 181 neg word ptr [bp+offset] ; low part 182 sbb word ptr [bp+offset+2],0 183 endm 184 185; Negate a long in (dx,ax). 186negr macro 187 neg dx 188 neg ax 189 sbb dx,0 190 endm 191 192; Divide two unsigned longs on the stack. 193; Leave either the quotient or the remainder in (dx,ax). 194; Operand offsets assume that bp (and only bp) has been pushed. 195nlo equ 6 196nhi equ 8 197dlo equ 10 198dhi equ 12 199 200; We use an offset in bx to distinguish div from mod, 201; and to indicate whether the result should be negated. 202odiv equ 0 203omod equ 2 204odivneg equ 4 205omodneg equ 6 206F_LMOD@ proc far 207LMOD@ proc far 208 push bp 209 mov bp,sp 210 mov bx,omod 211 ; Take abs of denominator 212 cmp byte ptr [bp+dhi+1],bh ; bh = 0 213 jge modpd 214 negbp dlo 215modpd: ; Negate mod if numerator < 0 216 cmp byte ptr [bp+nhi+1],bh ; bh = 0 217 jge udiv 218 mov bx,omodneg 219negnum: negbp nlo 220 jmp udiv 221LMOD@ endp 222F_LMOD@ endp 223F_LUMOD@ proc far 224LUMOD@ proc far 225 mov bx,omod 226 jmp udpush 227LUMOD@ endp 228F_LUMOD@ endp 229F_LDIV@ proc far 230LDIV@ proc far 231 push bp 232 mov bp,sp 233 mov bx,odiv 234 ; Negate quo if num^den < 0 235 mov ax,[bp+nhi] 236 xor ax,[bp+dhi] 237 jge divabs 238 mov bx,odivneg 239divabs: ; Take abs of denominator 240 cmp byte ptr [bp+dhi+1],bh ; bh = 0 241 jge divpd 242 negbp dlo 243divpd: ; Take abs of numerator 244 cmp byte ptr [bp+nhi+1],bh ; bh = 0 245 jge udiv 246 jmp negnum 247LDIV@ endp 248F_LDIV@ endp 249F_LUDIV@ proc far 250LUDIV@ proc far 251 mov bx,odiv 252udpush: push bp 253 mov bp,sp 254udiv: push bx ; odiv, omod, odivneg, omodneg 255 mov ax,[bp+nlo] 256 mov dx,[bp+nhi] 257 mov bx,[bp+dlo] 258 mov cx,[bp+dhi] 259; Now we are dividing dx:ax by cx:bx. 260; Check to see whether this is really a 32/16 division. 261 or cx,cx 262 jnz div2 263; 32/16, check for 16- vs. 32-bit quotient 264 cmp dx,bx 265 jae div1 266; 32/16 with 16-bit quotient, just do it. 267 div bx ; ax = quo, dx = rem 268 pop bx 269 pop bp 270 jmp cs:xx1[bx] 271 even 272xx1 dw offset divx1 273 dw offset modx1 274 dw offset divx1neg 275 dw offset modx1neg 276modx1: mov ax,dx 277divx1: xor dx,dx 278 ret 8 279modx1neg: mov ax,dx 280divx1neg: xor dx,dx 281rneg: negr 282 ret 8 283; 32/16 with 32-bit quotient, do in 2 parts. 284div1: mov cx,ax ; save lo num 285 mov ax,dx 286 xor dx,dx 287 div bx ; ax = hi quo 288 xchg cx,ax ; save hi quo, get lo num 289 div bx ; ax = lo quo, dx = rem 290 pop bx 291 pop bp 292 jmp cs:xx1a[bx] 293 even 294xx1a dw offset divx1a 295 dw offset modx1 296 dw offset divx1aneg 297 dw offset modx1neg 298divx1a: mov dx,cx ; hi quo 299 ret 8 300divx1aneg: mov dx,cx 301 jmp rneg 302; This is really a 32/32 bit division. 303; (Note that the quotient cannot exceed 16 bits.) 304; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2 305; (first edition). 306; Start by normalizing the numerator and denominator. 307div2: or ch,ch 308 jz div21 ; ch == 0, but cl != 0 309; Do 8 steps all at once. 310 mov bl,bh 311 mov bh,cl 312 mov cl,ch 313 xor ch,ch 314 mov al,ah 315 mov ah,dl 316 mov dl,dh 317 xor dh,dh 318 rol bx,1 ; faster than jmp 319div2a: rcr bx,1 ; finish previous shift 320div21: shr dx,1 321 rcr ax,1 322 shr cx,1 323 jnz div2a 324 rcr bx,1 325; Now we can do a 32/16 divide. 326div2x: div bx ; ax = quo, dx = rem 327; Multiply by the denominator, and correct the result. 328 mov cx,ax ; save quotient 329 mul word ptr [bp+dhi] 330 mov bx,ax ; save lo part of hi product 331 mov ax,cx 332 mul word ptr [bp+dlo] 333 add dx,bx 334; Now cx = trial quotient, (dx,ax) = cx * denominator. 335 not dx 336 neg ax 337 cmc 338 adc dx,0 ; double-precision neg 339 jc divz ; zero quotient 340 ; requires special handling 341 add ax,[bp+nlo] 342 adc dx,[bp+nhi] 343 jc divx 344; Quotient is too large, adjust it. 345div3: dec cx 346 add ax,[bp+dlo] 347 adc dx,[bp+dhi] 348 jnc div3 349; All done. (dx,ax) = remainder, cx = lo quotient. 350divx: pop bx 351 pop bp 352 jmp cs:xx3[bx] 353 even 354xx3 dw offset divx3 355 dw offset modx3 356 dw offset divx3neg 357 dw offset modx3neg 358divx3: mov ax,cx 359 xor dx,dx 360modx3: ret 8 361divx3neg: mov ax,cx 362 xor dx,dx 363modx3neg: jmp rneg 364; Handle zero quotient specially. 365divz: pop bx 366 jmp cs:xxz[bx] 367 even 368xxz dw offset divxz 369 dw offset modxz 370 dw offset divxz 371 dw offset modxzneg 372divxz: pop bp 373 ret 8 374modxzneg: negbp nlo 375modxz: mov ax,[bp+nlo] 376 mov dx,[bp+nhi] 377 pop bp 378 ret 8 379LUDIV@ endp 380F_LUDIV@ endp 381 382 endif ; FOR80386 383 384 385 ifdef NOFPU 386 387; See gsmisc.c for the C version of this code. 388 389; /* 390; * Floating multiply with fixed result, for avoiding floating point in 391; * common coordinate transformations. Assumes IEEE representation, 392; * 16-bit short, 32-bit long. Optimized for the case where the first 393; * operand has no more than 16 mantissa bits, e.g., where it is a user space 394; * coordinate (which are often integers). 395; * 396; * The assembly language version of this code is actually faster than 397; * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking 398; * a trap on every FPU operation). If there is no FPU, the assembly 399; * language version of this code is over 10 times as fast as the 400; * emulated FPU. 401; */ 402; fixed 403; fmul2fixed_(long /*float*/ a, long /*float*/ b) 404; { 405 406 PUBLIC _fmul2fixed_ 407_fmul2fixed_ proc far 408 push bp 409 mov bp,sp 410a equ 6 411alo equ a 412ahi equ a+2 413b equ 10 414blo equ b 415bhi equ b+2 416 push si ; will hold ma 417 push di ; will hold mb 418 419; int e = 260 + _fixed_shift - (( 420; (((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80) 421; ) >> 7); 422 423 mov dx,[bp+ahi] 424; dfmul2fixed enters here 425fmf: mov cx,260+12 426 mov ax,[bp+bhi] 427 and ax,7f80h 428 and dx,7f80h 429 add ax,dx 430 xchg ah,al ; ror ax,7 without using cl 431 rol ax,1 432 sub cx,ax 433 push cx ; e 434 435; ulong ma = (ushort)(a >> 8) | 0x8000; 436; ulong mb = (ushort)(b >> 8) | 0x8000; 437 438 mov si,[bp+alo+1] ; unaligned 439 clear ax 440 mov di,[bp+blo+1] ; unaligned 441 or si,8000h 442 or di,8000h 443 444; ulong p1 = ma * (b & 0xff); 445 446 mov al,[bp+blo] 447 mul si 448 449; (Do this later:) 450; ulong p = ma * mb; 451 452; if ( (byte)a ) /* >16 mantissa bits */ 453 454 cmp byte ptr [bp+alo],0 455 je mshort 456 457; { ulong p2 = (a & 0xff) * mb; 458; p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8; 459 460 mov cx,dx 461 mov bx,ax 462 clear ax 463 mov al,[bp+alo] 464 clear dx 465 mov dl,[bp+blo] 466 mul dx 467 mov dl,ah ; dx is zero 468 add bx,cx 469 adc cx,0 470 clear ax 471 mov al,[bp+alo] 472 mul di 473 add ax,bx 474 adc dx,cx 475 476; } 477 478mshort: 479 480; else 481; p += p1 >> 8; 482 483 mov bl,ah ; set (cx,bx) = (dx,ax) >> 8 484 mov bh,dl 485 clear cx 486 mov cl,dh 487 mov ax,si 488 mul di 489 add ax,bx 490 adc dx,cx 491 492; if ( (uint)e < 32 ) /* e = -1 is possible */ 493 494 pop cx ; e 495 cmp cx,16 496 jb shr1 497 498; else if ( e >= 32 ) /* also detects a=0 or b=0 */ 499 500 cmp cx,0 501 jl eneg 502 sub cx,16 503 cmp cx,16 504 jge shr0 505 mov ax,dx 506 clear dx 507 shr ax,cl 508 jmp ex 509 510; return fixed_0; 511 512shr0: clear ax 513 clear dx 514 jmp ex 515 516; else 517; p <<= -e; 518 519 even 520eneg: neg cx 521 shl dx,cl 522 mov bx,ax 523 shl ax,cl 524 rol bx,cl 525 xor bx,ax 526 add dx,bx 527 jmp ex 528 529; p >>= e; 530 531 even 532shr1: shr ax,cl 533 mov bx,dx 534 shr dx,cl 535 ror bx,cl 536 xor bx,dx 537 add ax,bx 538 539ex: 540 541; return ((a ^ b) < 0 ? -p : p); 542 543 mov cx,[bp+ahi] 544 xor cx,[bp+bhi] 545 jge pos 546 neg dx 547 neg ax 548 sbb dx,0 549pos: 550 551; } 552 553retu: pop di 554 pop si 555 mov sp,bp 556 pop bp 557 ret 558 559_fmul2fixed_ ENDP 560 561; The same routine with the first argument a double rather than a float. 562; The argument is split into two pieces to reduce data movement. 563 564 PUBLIC _dfmul2fixed_ 565_dfmul2fixed_ proc far 566 push bp 567 mov bp,sp 568xalo equ 6 569;b equ 10 570xahi equ 14 571 push si ; overlap this below 572 push di ; ditto 573 574; Shuffle the arguments and then use fmul2fixed. 575 576; Squeeze 3 exponent bits out of the top 35 bits of a. 577 578 mov dx,[bp+xahi+2] 579 mov bx,0c000h 580 mov ax,[bp+xahi] 581 and bx,dx 582 mov cx,[bp+xalo+2] 583 and dx,7ffh ; get rid of discarded bits 584 add cx,cx ; faster than shl! 585 jz cz ; detect common case 586 adc ax,ax ; faster than rcl! 587 adc dx,dx 588 add cx,cx 589 adc ax,ax 590 adc dx,dx 591 add cx,cx 592 adc ax,ax 593 mov [bp+alo],ax 594 adc dx,dx 595 or dx,bx 596 mov [bp+ahi],dx 597 jmp fmf 598 even 599cz: adc ax,ax 600 adc dx,dx 601 add ax,ax 602 adc dx,dx 603 add ax,ax 604 mov [bp+alo],ax 605 adc dx,dx 606 or dx,bx 607 mov [bp+ahi],dx 608 jmp fmf 609 610_dfmul2fixed_ ENDP 611 612 endif ; NOFPU 613 614 615; Transpose an 8x8 bit matrix. See gsmisc.c for the algorithm in C. 616 PUBLIC _memflip8x8 617_memflip8x8 proc far 618 push ds 619 push si 620 push di 621 ; After pushing, the offsets of the parameters are: 622 ; byte *inp=10, int line_size=14, byte *outp=16, int dist=20. 623 mov si,sp 624 mov di,ss:[si+14] ; line_size 625 lds si,ss:[si+10] ; inp 626 ; We assign variables to registers as follows: 627 ; ax = AE, bx = BF, cx (or di) = CG, dx = DH. 628 ; Load the input data. Initially we assign 629 ; ax = AB, bx = EF, cx (or di) = CD, dx = GH. 630 mov ah,[si] 631iload macro reg 632 add si,di 633 mov reg,[si] 634 endm 635 iload al 636 iload ch 637 iload cl 638 iload bh 639 iload bl 640 iload dh 641 iload dl 642 ; Transposition macro, see C code for explanation. 643trans macro reg1,reg2,shift,mask 644 mov si,reg1 645 shr si,shift 646 xor si,reg2 647 and si,mask 648 xor reg2,si 649 shl si,shift 650 xor reg1,si 651 endm 652 ; Do 4x4 transpositions 653 mov di,cx ; we need cl for the shift count 654 mov cl,4 655 trans bx,ax,cl,0f0fh 656 trans dx,di,cl,0f0fh 657 ; Swap B/E, D/G 658 xchg al,bh 659 mov cx,di 660 xchg cl,dh 661 ; Do 2x2 transpositions 662 mov di,cx ; need cl again 663 mov cl,2 664 trans di,ax,cl,3333h 665 trans dx,bx,cl,3333h 666 mov cx,di ; done shifting >1 667 ; Do 1x1 transpositions 668 trans bx,ax,1,5555h 669 trans dx,cx,1,5555h 670 ; Store result 671 mov si,sp 672 mov di,ss:[si+20] ; dist 673 lds si,ss:[si+16] ; outp 674 mov [si],ah 675istore macro reg 676 add si,di 677 mov [si],reg 678 endm 679 istore bh 680 istore ch 681 istore dh 682 istore al 683 istore bl 684 istore cl 685 istore dl 686 ; All done 687 pop di 688 pop si 689 pop ds 690 ret 691_memflip8x8 ENDP 692 693 694utilasm_TEXT ENDS 695 END 696