; Copyright (C) 1989, 1992, 1993 Aladdin Enterprises. All rights reserved. ; ; This software is provided AS-IS with no warranty, either express or ; implied. ; ; This software is distributed under license and may not be copied, ; modified or distributed except as expressly authorized under the terms ; of the license contained in the file LICENSE in this distribution. ; ; For more information about licensing, please refer to ; http://www.ghostscript.com/licensing/. For information on ; commercial licensing, go to http://www.artifex.com/licensing/ or ; contact Artifex Software, Inc., 101 Lucas Valley Road #110, ; San Rafael, CA 94903, U.S.A., +1(415)492-9861. ; $Id$ ; iutilasm.asm ; Assembly code for Ghostscript interpreter on MS-DOS systems ifdef FOR80386 .286c endif utilasm_TEXT SEGMENT WORD PUBLIC 'CODE' ASSUME CS:utilasm_TEXT ifdef FOR80386 ; Macro for 32-bit operand prefix. OP32 macro db 66h endm endif ; FOR80386 ; Clear a register clear macro reg xor reg,reg endm ifdef FOR80386 ; Replace the multiply and divide routines in the Turbo C library ; if we are running on an 80386. ; Macro to swap the halves of a 32-bit register. ; Unfortunately, masm won't allow a shift instruction with a count of 16, ; so we have to code it in hex. swap macro regno OP32 db 0c1h,0c0h+regno,16 ; rol regno,16 endm regax equ 0 regcx equ 1 regdx equ 2 regbx equ 3 ; Multiply (dx,ax) by (cx,bx) to (dx,ax). PUBLIC LXMUL@ PUBLIC F_LXMUL@ F_LXMUL@ proc far LXMUL@ proc far swap regdx mov dx,ax swap regcx mov cx,bx OP32 db 0fh,0afh,0d1h ; imul dx,cx OP32 mov ax,dx swap regdx ret LXMUL@ endp F_LXMUL@ endp ; Divide two stack operands, leave the result in (dx,ax). ifdef DEBUG setup32 macro mov bx,sp push bp mov bp,sp OP32 mov ax,ss:[bx+4] ; dividend endm ret32 macro n mov sp,bp pop bp ret n endm else ; !DEBUG setup32 macro mov bx,sp OP32 mov ax,ss:[bx+4] ; dividend endm ret32 macro n ret n endm endif ; (!)DEBUG PUBLIC LDIV@, LUDIV@, LMOD@, LUMOD@ PUBLIC F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@ F_LDIV@ proc far LDIV@ proc far setup32 OP32 cwd OP32 idiv word ptr ss:[bx+8] ; divisor OP32 mov dx,ax swap regdx ret32 8 LDIV@ endp F_LDIV@ endp F_LUDIV@ proc far LUDIV@ proc far setup32 OP32 xor dx,dx OP32 div word ptr ss:[bx+8] ; divisor OP32 mov dx,ax swap regdx ret32 8 LUDIV@ endp F_LUDIV@ endp F_LMOD@ proc far LMOD@ proc far setup32 OP32 cwd OP32 idiv word ptr ss:[bx+8] ; divisor OP32 mov ax,dx swap regdx ret32 8 LMOD@ endp F_LMOD@ endp F_LUMOD@ proc far LUMOD@ proc far setup32 OP32 xor dx,dx OP32 div word ptr ss:[bx+8] ; divisor OP32 mov ax,dx swap regdx ret32 8 LUMOD@ endp F_LUMOD@ endp else ; !FOR80386 ; Replace the divide routines in the Turbo C library, ; which do the division one bit at a time (!). PUBLIC LDIV@, LMOD@, LUDIV@, LUMOD@ PUBLIC F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@ ; Negate a long on the stack. negbp macro offset neg word ptr [bp+offset+2] ; high part neg word ptr [bp+offset] ; low part sbb word ptr [bp+offset+2],0 endm ; Negate a long in (dx,ax). negr macro neg dx neg ax sbb dx,0 endm ; Divide two unsigned longs on the stack. ; Leave either the quotient or the remainder in (dx,ax). ; Operand offsets assume that bp (and only bp) has been pushed. nlo equ 6 nhi equ 8 dlo equ 10 dhi equ 12 ; We use an offset in bx to distinguish div from mod, ; and to indicate whether the result should be negated. odiv equ 0 omod equ 2 odivneg equ 4 omodneg equ 6 F_LMOD@ proc far LMOD@ proc far push bp mov bp,sp mov bx,omod ; Take abs of denominator cmp byte ptr [bp+dhi+1],bh ; bh = 0 jge modpd negbp dlo modpd: ; Negate mod if numerator < 0 cmp byte ptr [bp+nhi+1],bh ; bh = 0 jge udiv mov bx,omodneg negnum: negbp nlo jmp udiv LMOD@ endp F_LMOD@ endp F_LUMOD@ proc far LUMOD@ proc far mov bx,omod jmp udpush LUMOD@ endp F_LUMOD@ endp F_LDIV@ proc far LDIV@ proc far push bp mov bp,sp mov bx,odiv ; Negate quo if num^den < 0 mov ax,[bp+nhi] xor ax,[bp+dhi] jge divabs mov bx,odivneg divabs: ; Take abs of denominator cmp byte ptr [bp+dhi+1],bh ; bh = 0 jge divpd negbp dlo divpd: ; Take abs of numerator cmp byte ptr [bp+nhi+1],bh ; bh = 0 jge udiv jmp negnum LDIV@ endp F_LDIV@ endp F_LUDIV@ proc far LUDIV@ proc far mov bx,odiv udpush: push bp mov bp,sp udiv: push bx ; odiv, omod, odivneg, omodneg mov ax,[bp+nlo] mov dx,[bp+nhi] mov bx,[bp+dlo] mov cx,[bp+dhi] ; Now we are dividing dx:ax by cx:bx. ; Check to see whether this is really a 32/16 division. or cx,cx jnz div2 ; 32/16, check for 16- vs. 32-bit quotient cmp dx,bx jae div1 ; 32/16 with 16-bit quotient, just do it. div bx ; ax = quo, dx = rem pop bx pop bp jmp cs:xx1[bx] even xx1 dw offset divx1 dw offset modx1 dw offset divx1neg dw offset modx1neg modx1: mov ax,dx divx1: xor dx,dx ret 8 modx1neg: mov ax,dx divx1neg: xor dx,dx rneg: negr ret 8 ; 32/16 with 32-bit quotient, do in 2 parts. div1: mov cx,ax ; save lo num mov ax,dx xor dx,dx div bx ; ax = hi quo xchg cx,ax ; save hi quo, get lo num div bx ; ax = lo quo, dx = rem pop bx pop bp jmp cs:xx1a[bx] even xx1a dw offset divx1a dw offset modx1 dw offset divx1aneg dw offset modx1neg divx1a: mov dx,cx ; hi quo ret 8 divx1aneg: mov dx,cx jmp rneg ; This is really a 32/32 bit division. ; (Note that the quotient cannot exceed 16 bits.) ; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2 ; (first edition). ; Start by normalizing the numerator and denominator. div2: or ch,ch jz div21 ; ch == 0, but cl != 0 ; Do 8 steps all at once. mov bl,bh mov bh,cl mov cl,ch xor ch,ch mov al,ah mov ah,dl mov dl,dh xor dh,dh rol bx,1 ; faster than jmp div2a: rcr bx,1 ; finish previous shift div21: shr dx,1 rcr ax,1 shr cx,1 jnz div2a rcr bx,1 ; Now we can do a 32/16 divide. div2x: div bx ; ax = quo, dx = rem ; Multiply by the denominator, and correct the result. mov cx,ax ; save quotient mul word ptr [bp+dhi] mov bx,ax ; save lo part of hi product mov ax,cx mul word ptr [bp+dlo] add dx,bx ; Now cx = trial quotient, (dx,ax) = cx * denominator. not dx neg ax cmc adc dx,0 ; double-precision neg jc divz ; zero quotient ; requires special handling add ax,[bp+nlo] adc dx,[bp+nhi] jc divx ; Quotient is too large, adjust it. div3: dec cx add ax,[bp+dlo] adc dx,[bp+dhi] jnc div3 ; All done. (dx,ax) = remainder, cx = lo quotient. divx: pop bx pop bp jmp cs:xx3[bx] even xx3 dw offset divx3 dw offset modx3 dw offset divx3neg dw offset modx3neg divx3: mov ax,cx xor dx,dx modx3: ret 8 divx3neg: mov ax,cx xor dx,dx modx3neg: jmp rneg ; Handle zero quotient specially. divz: pop bx jmp cs:xxz[bx] even xxz dw offset divxz dw offset modxz dw offset divxz dw offset modxzneg divxz: pop bp ret 8 modxzneg: negbp nlo modxz: mov ax,[bp+nlo] mov dx,[bp+nhi] pop bp ret 8 LUDIV@ endp F_LUDIV@ endp endif ; FOR80386 ifdef NOFPU ; See gsmisc.c for the C version of this code. ; /* ; * Floating multiply with fixed result, for avoiding floating point in ; * common coordinate transformations. Assumes IEEE representation, ; * 16-bit short, 32-bit long. Optimized for the case where the first ; * operand has no more than 16 mantissa bits, e.g., where it is a user space ; * coordinate (which are often integers). ; * ; * The assembly language version of this code is actually faster than ; * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking ; * a trap on every FPU operation). If there is no FPU, the assembly ; * language version of this code is over 10 times as fast as the ; * emulated FPU. ; */ ; fixed ; fmul2fixed_(long /*float*/ a, long /*float*/ b) ; { PUBLIC _fmul2fixed_ _fmul2fixed_ proc far push bp mov bp,sp a equ 6 alo equ a ahi equ a+2 b equ 10 blo equ b bhi equ b+2 push si ; will hold ma push di ; will hold mb ; int e = 260 + _fixed_shift - (( ; (((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80) ; ) >> 7); mov dx,[bp+ahi] ; dfmul2fixed enters here fmf: mov cx,260+12 mov ax,[bp+bhi] and ax,7f80h and dx,7f80h add ax,dx xchg ah,al ; ror ax,7 without using cl rol ax,1 sub cx,ax push cx ; e ; ulong ma = (ushort)(a >> 8) | 0x8000; ; ulong mb = (ushort)(b >> 8) | 0x8000; mov si,[bp+alo+1] ; unaligned clear ax mov di,[bp+blo+1] ; unaligned or si,8000h or di,8000h ; ulong p1 = ma * (b & 0xff); mov al,[bp+blo] mul si ; (Do this later:) ; ulong p = ma * mb; ; if ( (byte)a ) /* >16 mantissa bits */ cmp byte ptr [bp+alo],0 je mshort ; { ulong p2 = (a & 0xff) * mb; ; p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8; mov cx,dx mov bx,ax clear ax mov al,[bp+alo] clear dx mov dl,[bp+blo] mul dx mov dl,ah ; dx is zero add bx,cx adc cx,0 clear ax mov al,[bp+alo] mul di add ax,bx adc dx,cx ; } mshort: ; else ; p += p1 >> 8; mov bl,ah ; set (cx,bx) = (dx,ax) >> 8 mov bh,dl clear cx mov cl,dh mov ax,si mul di add ax,bx adc dx,cx ; if ( (uint)e < 32 ) /* e = -1 is possible */ pop cx ; e cmp cx,16 jb shr1 ; else if ( e >= 32 ) /* also detects a=0 or b=0 */ cmp cx,0 jl eneg sub cx,16 cmp cx,16 jge shr0 mov ax,dx clear dx shr ax,cl jmp ex ; return fixed_0; shr0: clear ax clear dx jmp ex ; else ; p <<= -e; even eneg: neg cx shl dx,cl mov bx,ax shl ax,cl rol bx,cl xor bx,ax add dx,bx jmp ex ; p >>= e; even shr1: shr ax,cl mov bx,dx shr dx,cl ror bx,cl xor bx,dx add ax,bx ex: ; return ((a ^ b) < 0 ? -p : p); mov cx,[bp+ahi] xor cx,[bp+bhi] jge pos neg dx neg ax sbb dx,0 pos: ; } retu: pop di pop si mov sp,bp pop bp ret _fmul2fixed_ ENDP ; The same routine with the first argument a double rather than a float. ; The argument is split into two pieces to reduce data movement. PUBLIC _dfmul2fixed_ _dfmul2fixed_ proc far push bp mov bp,sp xalo equ 6 ;b equ 10 xahi equ 14 push si ; overlap this below push di ; ditto ; Shuffle the arguments and then use fmul2fixed. ; Squeeze 3 exponent bits out of the top 35 bits of a. mov dx,[bp+xahi+2] mov bx,0c000h mov ax,[bp+xahi] and bx,dx mov cx,[bp+xalo+2] and dx,7ffh ; get rid of discarded bits add cx,cx ; faster than shl! jz cz ; detect common case adc ax,ax ; faster than rcl! adc dx,dx add cx,cx adc ax,ax adc dx,dx add cx,cx adc ax,ax mov [bp+alo],ax adc dx,dx or dx,bx mov [bp+ahi],dx jmp fmf even cz: adc ax,ax adc dx,dx add ax,ax adc dx,dx add ax,ax mov [bp+alo],ax adc dx,dx or dx,bx mov [bp+ahi],dx jmp fmf _dfmul2fixed_ ENDP endif ; NOFPU ; Transpose an 8x8 bit matrix. See gsmisc.c for the algorithm in C. PUBLIC _memflip8x8 _memflip8x8 proc far push ds push si push di ; After pushing, the offsets of the parameters are: ; byte *inp=10, int line_size=14, byte *outp=16, int dist=20. mov si,sp mov di,ss:[si+14] ; line_size lds si,ss:[si+10] ; inp ; We assign variables to registers as follows: ; ax = AE, bx = BF, cx (or di) = CG, dx = DH. ; Load the input data. Initially we assign ; ax = AB, bx = EF, cx (or di) = CD, dx = GH. mov ah,[si] iload macro reg add si,di mov reg,[si] endm iload al iload ch iload cl iload bh iload bl iload dh iload dl ; Transposition macro, see C code for explanation. trans macro reg1,reg2,shift,mask mov si,reg1 shr si,shift xor si,reg2 and si,mask xor reg2,si shl si,shift xor reg1,si endm ; Do 4x4 transpositions mov di,cx ; we need cl for the shift count mov cl,4 trans bx,ax,cl,0f0fh trans dx,di,cl,0f0fh ; Swap B/E, D/G xchg al,bh mov cx,di xchg cl,dh ; Do 2x2 transpositions mov di,cx ; need cl again mov cl,2 trans di,ax,cl,3333h trans dx,bx,cl,3333h mov cx,di ; done shifting >1 ; Do 1x1 transpositions trans bx,ax,1,5555h trans dx,cx,1,5555h ; Store result mov si,sp mov di,ss:[si+20] ; dist lds si,ss:[si+16] ; outp mov [si],ah istore macro reg add si,di mov [si],reg endm istore bh istore ch istore dh istore al istore bl istore cl istore dl ; All done pop di pop si pop ds ret _memflip8x8 ENDP utilasm_TEXT ENDS END