; sys/src/cmd/gs/src/iutilasm.asm

;    Copyright (C) 1989, 1992, 1993 Aladdin Enterprises.  All rights reserved.
; 
; This software is provided AS-IS with no warranty, either express or
; implied.
; 
; This software is distributed under license and may not be copied,
; modified or distributed except as expressly authorized under the terms
; of the license contained in the file LICENSE in this distribution.
; 
; For more information about licensing, please refer to
; http://www.ghostscript.com/licensing/. For information on
; commercial licensing, go to http://www.artifex.com/licensing/ or
; contact Artifex Software, Inc., 101 Lucas Valley Road #110,
; San Rafael, CA  94903, U.S.A., +1(415)492-9861.

; $Id: iutilasm.asm,v 1.4 2002/02/21 22:24:53 giles Exp $
; iutilasm.asm
; Assembly code for Ghostscript interpreter on MS-DOS systems

	ifdef	FOR80386

; Assemble in .286c mode even for the 80386 build: the 32-bit instructions
; used in this file are produced from 16-bit instructions preceded by a
; hand-emitted operand-size prefix, or are coded directly as bytes.
	.286c

	endif

; Everything below is assembled into the word-aligned public code segment
; utilasm_TEXT; CS is assumed to address it (the jump tables further down
; are read through a cs: override).
utilasm_TEXT	SEGMENT	WORD PUBLIC 'CODE'
	ASSUME	CS:utilasm_TEXT


	ifdef	FOR80386

; Macro for 32-bit operand prefix.
; Emits the single byte 66h immediately in front of the next instruction,
; so that a 16-bit instruction written after it operates on 32-bit
; operands (for example "OP32" followed by "mov ax,dx" acts on eax/edx).
OP32	macro
	db	66h
	endm

	endif					; FOR80386

; Clear a register
; Sets reg to zero by xor-ing it with itself.  The flags are modified.

clear	macro	reg
	xor	reg,reg
	endm


	ifdef	FOR80386

; Replace the multiply and divide routines in the Turbo C library
; if we are running on an 80386.

; Macro to swap the halves of a 32-bit register.
; Unfortunately, masm won't allow a shift instruction with a count of 16,
; so we have to code it in hex.
; regno is the numeric encoding of the register (one of the reg?? equates
; defined right after the macro).  The bytes emitted are: the 66h prefix
; (via OP32), opcode 0c1h, a second byte 0c0h+regno that selects the
; register, and the immediate count 16.
swap	macro	regno
	  OP32
	db	0c1h,0c0h+regno,16		; rol regno,16
	endm
; Register numbers that can be passed to swap.
regax	equ	0
regcx	equ	1
regdx	equ	2
regbx	equ	3


; Multiply (dx,ax) by (cx,bx) to (dx,ax).
; In:    dx:ax = first 32-bit operand, cx:bx = second 32-bit operand.
; Out:   dx:ax = low 32 bits of the product.
; Clobb: cx (left equal to bx), the upper halves of eax/ecx/edx, flags.
; The routine is exported under two names (LXMUL@ and F_LXMUL@) that
; share one entry point; it returns with a plain far ret.
	PUBLIC	LXMUL@
	PUBLIC	F_LXMUL@
F_LXMUL@ proc	far
LXMUL@	proc	far
; Pack the first operand into edx: move dx to the upper half, then load
; the low half from ax.
	swap	regdx
	mov	dx,ax
; Pack the second operand into ecx the same way.
	swap	regcx
	mov	cx,bx
	  OP32
	db	0fh,0afh,0d1h			; imul dx,cx
; With the prefix this copies the whole 32-bit product to eax, so ax holds
; the low word.
	  OP32
	mov	ax,dx
; Bring the high word of the product down into dx.
	swap	regdx
	ret
LXMUL@	endp
F_LXMUL@ endp


; Divide two stack operands, leave the result in (dx,ax).
;
; setup32 and ret32 are the shared entry and exit sequences of the four
; 80386 divide/modulo routines.
;   setup32  copies sp to bx (so ss:[bx+4] is the first stack operand,
;            just above the 4-byte far return address) and loads the
;            32-bit dividend into eax.
;   ret32 n  returns, popping n bytes of arguments.
; The DEBUG variants additionally build and tear down a bp frame.  bx is
; captured before bp is pushed, so the operand offsets are the same in
; both variants.

	ifdef	DEBUG

setup32	macro
	mov	bx,sp
	push	bp
	mov	bp,sp
	  OP32
	mov	ax,ss:[bx+4]			; dividend
	endm

ret32	macro	n
	mov	sp,bp
	pop	bp
	ret	n
	endm

	else					; !DEBUG

setup32	macro
	mov	bx,sp
	  OP32
	mov	ax,ss:[bx+4]			; dividend
	endm

ret32	macro	n
	ret	n
	endm

	endif					; (!)DEBUG

	PUBLIC	LDIV@, LUDIV@, LMOD@, LUMOD@
	PUBLIC	F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@
; Signed 32-bit division (80386 version).
; In:    32-bit dividend at ss:[sp+4], 32-bit divisor at ss:[sp+8]
;        (offsets at entry, above the far return address).
; Out:   dx:ax = quotient.  The 8 bytes of arguments are popped on return.
; Clobb: bx, upper halves of eax/edx, flags.
F_LDIV@	proc	far
LDIV@	proc	far
	setup32
; Prefixed cwd sign-extends eax into edx:eax for the 32-bit idiv.
	  OP32
	cwd
; The prefix turns this into a 32-bit divide by the doubleword at [bx+8].
	  OP32
	idiv	word ptr ss:[bx+8]		; divisor
; Copy the 32-bit quotient to edx, then rotate so dx = its high word;
; ax already holds the low word.
	  OP32
	mov	dx,ax
	swap	regdx
	ret32	8
LDIV@	endp
F_LDIV@	endp
F_LUDIV@ proc	far
LUDIV@	proc	far
; Unsigned 32-bit division (80386 version).
; In:    32-bit dividend at ss:[sp+4], 32-bit divisor at ss:[sp+8].
; Out:   dx:ax = quotient.  The 8 bytes of arguments are popped on return.
; Clobb: bx, upper halves of eax/edx, flags.
	setup32
; Zero all of edx so that edx:eax is the zero-extended dividend.
	  OP32
	xor	dx,dx
	  OP32
	div	word ptr ss:[bx+8]		; divisor
; Copy the 32-bit quotient to edx, then rotate so dx = its high word.
	  OP32
	mov	dx,ax
	swap	regdx
	ret32	8
LUDIV@	endp
F_LUDIV@ endp
F_LMOD@	proc	far
LMOD@	proc	far
; Signed 32-bit remainder (80386 version).
; In:    32-bit dividend at ss:[sp+4], 32-bit divisor at ss:[sp+8].
; Out:   dx:ax = remainder.  The 8 bytes of arguments are popped on return.
; Clobb: bx, upper halves of eax/edx, flags.
	setup32
	  OP32
	cwd
	  OP32
	idiv	word ptr ss:[bx+8]		; divisor
; The remainder is left in edx by the divide: copy it to eax so ax = low
; word, then rotate edx so dx = high word.
	  OP32
	mov	ax,dx
	swap	regdx
	ret32	8
LMOD@	endp
F_LMOD@	endp
F_LUMOD@ proc	far
LUMOD@	proc	far
; Unsigned 32-bit remainder (80386 version).
; In:    32-bit dividend at ss:[sp+4], 32-bit divisor at ss:[sp+8].
; Out:   dx:ax = remainder.  The 8 bytes of arguments are popped on return.
; Clobb: bx, upper halves of eax/edx, flags.
	setup32
	  OP32
	xor	dx,dx
	  OP32
	div	word ptr ss:[bx+8]		; divisor
; Copy the 32-bit remainder from edx to eax (ax = low word), then rotate
; edx so dx = high word.
	  OP32
	mov	ax,dx
	swap	regdx
	ret32	8
LUMOD@	endp
F_LUMOD@ endp

	else					; !FOR80386

; Replace the divide routines in the Turbo C library,
; which do the division one bit at a time (!).

	PUBLIC	LDIV@, LMOD@, LUDIV@, LUMOD@
	PUBLIC	F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@

; Negate a long on the stack.
; offset is the bp-relative offset of the low word; the high word is at
; offset+2.  Both words are negated, then the borrow produced by negating
; the low word (carry set when the low word was non-zero) is subtracted
; from the high word.  The order of the three instructions matters: the
; sbb must see the carry left by the second neg.
negbp	macro	offset
	neg	word ptr [bp+offset+2]		; high part
	neg	word ptr [bp+offset]		; low part
	sbb	word ptr [bp+offset+2],0
	endm

; Negate a long in (dx,ax).
; Same scheme as negbp, applied to the register pair.
negr	macro
	neg	dx
	neg	ax
	sbb	dx,0
	endm

; Divide two unsigned longs on the stack.
; Leave either the quotient or the remainder in (dx,ax).
; Operand offsets assume that bp (and only bp) has been pushed.
; (2 bytes of saved bp + 4 bytes of far return address = 6.)
nlo	equ	6				; numerator, low word
nhi	equ	8				; numerator, high word
dlo	equ	10				; denominator, low word
dhi	equ	12				; denominator, high word

; We use an offset in bx to distinguish div from mod,
; and to indicate whether the result should be negated.
; The values are byte offsets into the 4-entry word jump tables
; (xx1, xx1a, xx3, xxz) used at the end of the division.
odiv	equ	0
omod	equ	2
odivneg	equ	4
omodneg	equ	6
F_LMOD@	proc	far
LMOD@	proc	far
; Signed 32-bit remainder (8086 version).
; In:    numerator and denominator longs on the stack (see nlo..dhi).
; Out:   dx:ax = remainder; it is negated when the numerator is negative.
; Both operands are replaced on the stack by their absolute values, then
; control transfers to udiv (inside LUDIV@), which performs the unsigned
; division and returns to the caller with ret 8.
; The label negnum is also used by LDIV@.
	push	bp
	mov	bp,sp
	mov	bx,omod
			; Take abs of denominator
; Comparing the top byte against bh (0) as a signed value tests the sign.
	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
	jge	modpd
	negbp	dlo
modpd:			; Negate mod if numerator < 0
	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
	jge	udiv
	mov	bx,omodneg
negnum:	negbp	nlo
	jmp	udiv
LMOD@	endp
F_LMOD@	endp
F_LUMOD@ proc	far
LUMOD@	proc	far
; Unsigned 32-bit remainder (8086 version).
; Selects the "mod" result and joins LUDIV@ at udpush, which sets up the
; bp frame, divides, and returns to the caller with ret 8.
	mov	bx,omod
	jmp	udpush
LUMOD@	endp
F_LUMOD@ endp
F_LDIV@	proc	far
LDIV@	proc	far
; Signed 32-bit division (8086 version).
; In:    numerator and denominator longs on the stack (see nlo..dhi).
; Out:   dx:ax = quotient; it is negated when the operands' signs differ.
; Both operands are replaced on the stack by their absolute values, then
; control transfers to udiv (inside LUDIV@), which performs the unsigned
; division and returns to the caller with ret 8.
	push	bp
	mov	bp,sp
	mov	bx,odiv
			; Negate quo if num^den < 0
; xor leaves the sign flag equal to the xor of the two sign bits and
; clears overflow, so jge is taken when the signs agree.
	mov	ax,[bp+nhi]
	xor	ax,[bp+dhi]
	jge	divabs
	mov	bx,odivneg
divabs:			; Take abs of denominator
	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
	jge	divpd
	negbp	dlo
divpd:			; Take abs of numerator
	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
	jge	udiv
; negnum (in LMOD@) negates the numerator and continues at udiv; bx is
; left unchanged on that path.
	jmp	negnum
LDIV@	endp
F_LDIV@	endp
F_LUDIV@ proc	far
LUDIV@	proc	far
; Unsigned 32-bit division (8086 version), and the common back end of
; LDIV@, LMOD@ and LUMOD@.
; In:    numerator and denominator longs on the stack (see nlo..dhi).
; Out:   dx:ax = quotient or remainder, selected (and optionally negated)
;        by the odiv/omod/odivneg/omodneg value in bx.
; Every exit path restores bp and returns with ret 8, popping both
; operands.  Clobbers ax, bx, cx, dx and the flags.
; Other entry points:
;   udpush - bx already holds the selector; bp not yet pushed.
;   udiv   - bx holds the selector and the bp frame is already set up.
	mov	bx,odiv
udpush:	push	bp
	mov	bp,sp
udiv:	push	bx				; odiv, omod, odivneg, omodneg
	mov	ax,[bp+nlo]
	mov	dx,[bp+nhi]
	mov	bx,[bp+dlo]
	mov	cx,[bp+dhi]
; Now we are dividing dx:ax by cx:bx.
; Check to see whether this is really a 32/16 division.
	or	cx,cx
	jnz	div2
; 32/16, check for 16- vs. 32-bit quotient
; (if the high word of the numerator is >= the divisor, a single div
; would overflow, so the two-step path is used).
	cmp	dx,bx
	jae	div1
; 32/16 with 16-bit quotient, just do it.
	div	bx				; ax = quo, dx = rem
	pop	bx
	pop	bp
; Dispatch on the selector through a table stored in the code segment.
	jmp	cs:xx1[bx]
	even
xx1	dw	offset divx1
	dw	offset modx1
	dw	offset divx1neg
	dw	offset modx1neg
; The "mod" entries move the remainder into ax and fall into the
; corresponding "div" entry, which zeroes the high word.
modx1:	mov	ax,dx
divx1:	xor	dx,dx
	ret	8
modx1neg: mov	ax,dx
divx1neg: xor	dx,dx
; rneg: shared tail that negates (dx,ax) and returns.
rneg:	negr
	ret	8
; 32/16 with 32-bit quotient, do in 2 parts.
div1:	mov	cx,ax				; save lo num
	mov	ax,dx
	xor	dx,dx
	div	bx				; ax = hi quo
	xchg	cx,ax				; save hi quo, get lo num
	div	bx				; ax = lo quo, dx = rem
	pop	bx
	pop	bp
	jmp	cs:xx1a[bx]
	even
; The remainder fits in 16 bits here too, so the mod entries are shared
; with the table above.
xx1a	dw	offset divx1a
	dw	offset modx1
	dw	offset divx1aneg
	dw	offset modx1neg
divx1a:	mov	dx,cx				; hi quo
	ret	8
divx1aneg: mov	dx,cx
	jmp	rneg
; This is really a 32/32 bit division.
; (Note that the quotient cannot exceed 16 bits.)
; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2
; (first edition).
; Start by normalizing the numerator and denominator.
; Both dx:ax and cx:bx are shifted right together until cx is zero, so
; that the denominator fits in bx.
div2:	or	ch,ch
	jz	div21				; ch == 0, but cl != 0
; Do 8 steps all at once.
; (Byte moves shift both 32-bit values right by 8 bits.)
	mov	bl,bh
	mov	bh,cl
	mov	cl,ch
	xor	ch,ch
	mov	al,ah
	mov	ah,dl
	mov	dl,dh
	xor	dh,dh
; The rol is undone by the rcr at div2a, so the bit loop can be entered
; by falling through instead of jumping over that rcr.
	rol	bx,1				; faster than jmp
div2a:	rcr	bx,1				; finish previous shift
div21:	shr	dx,1
	rcr	ax,1
; The bit shifted out of cx is carried into bx by the next rcr (at div2a
; when looping, or the one following the jnz on exit).
	shr	cx,1
	jnz	div2a
	rcr	bx,1
; Now we can do a 32/16 divide.
div2x:	div	bx				; ax = quo, dx = rem
; Multiply by the denominator, and correct the result.
; The original (unshifted) operands are re-read from the stack.
	mov	cx,ax				; save quotient
	mul	word ptr [bp+dhi]
	mov	bx,ax				; save lo part of hi product
	mov	ax,cx
	mul	word ptr [bp+dlo]
	add	dx,bx
; Now cx = trial quotient, (dx,ax) = cx * denominator.
; neg ax sets carry when ax was non-zero; cmc inverts it so that the adc
; adds 1 to the complemented high word exactly when the low word was
; zero.  A carry out of the adc means the whole product was zero.
	not	dx
	neg	ax
	cmc
	adc	dx,0				; double-precision neg
	jc	divz				; zero quotient
						; requires special handling
; (dx,ax) = numerator - product; a carry here means the difference is
; non-negative, i.e. the trial quotient was not too large.
	add	ax,[bp+nlo]
	adc	dx,[bp+nhi]
	jc	divx
; Quotient is too large, adjust it.
; Add the denominator back until the remainder is non-negative again.
div3:	dec	cx
	add	ax,[bp+dlo]
	adc	dx,[bp+dhi]
	jnc	div3
; All done.  (dx,ax) = remainder, cx = lo quotient.
divx:	pop	bx
	pop	bp
	jmp	cs:xx3[bx]
	even
xx3	dw	offset divx3
	dw	offset modx3
	dw	offset divx3neg
	dw	offset modx3neg
divx3:	mov	ax,cx
	xor	dx,dx
modx3:	ret	8
divx3neg: mov	ax,cx
	xor	dx,dx
modx3neg: jmp	rneg
; Handle zero quotient specially.
; bp is still pushed here because the mod cases re-read the numerator
; from the stack.  (dx,ax) is zero at this point, so both quotient cases
; share divxz: negating zero is a no-op.
divz:	pop	bx
	jmp	cs:xxz[bx]
	even
xxz	dw	offset divxz
	dw	offset modxz
	dw	offset divxz
	dw	offset modxzneg
divxz:	pop	bp
	ret	8
; With a zero quotient the remainder is the numerator itself (negated
; in place on the stack first for the negated-mod case).
modxzneg: negbp	nlo
modxz:	mov	ax,[bp+nlo]
	mov	dx,[bp+nhi]
	pop	bp
	ret	8
LUDIV@	endp
F_LUDIV@ endp

	endif					; FOR80386


	ifdef	NOFPU

; See gsmisc.c for the C version of this code.

; /*
;  * Floating multiply with fixed result, for avoiding floating point in
;  * common coordinate transformations.  Assumes IEEE representation,
;  * 16-bit short, 32-bit long.  Optimized for the case where the first
;  * operand has no more than 16 mantissa bits, e.g., where it is a user space
;  * coordinate (which are often integers).
;  *
;  * The assembly language version of this code is actually faster than
;  * the FPU, if the code is compiled with FPU_TYPE=0 (which requires taking
;  * a trap on every FPU operation).  If there is no FPU, the assembly
;  * language version of this code is over 10 times as fast as the
;  * emulated FPU.
;  */
; fixed
; fmul2fixed_(long /*float*/ a, long /*float*/ b)
; {

	PUBLIC	_fmul2fixed_
;-----------------------------------------------------------------------
; fixed fmul2fixed_(long /*float*/ a, long /*float*/ b)
; Multiplies two IEEE single-precision values, passed as their 32-bit
; bit patterns, and returns the product as a fixed-point number with 12
; fraction bits (the constant 260+12 below), using integer arithmetic.
;
; In:    [bp+6] = a, [bp+10] = b (far call, after push bp).
; Out:   dx:ax = result.  The arguments are left on the stack (plain ret).
; Saved: bp, si, di.   Clobb: ax, bx, cx, dx, flags.
;
; _dfmul2fixed_ jumps to the label fmf with the same frame (bp, si and di
; pushed), a rewritten into [bp+alo]/[bp+ahi], and dx = [bp+ahi].
;
; NOTE(review): the left-shift path (eneg) and the 16..31 right-shift
; path do no range checking; results for out-of-range exponents are
; whatever the 16-bit shifts produce.
;-----------------------------------------------------------------------
_fmul2fixed_ proc far
	push	bp
	mov	bp,sp
a	equ	6
alo	equ	a
ahi	equ	a+2
b	equ	10
blo	equ	b
bhi	equ	b+2
	push	si		; will hold ma
	push	di		; will hold mb

; 	int e = 260 + _fixed_shift - ((
; 		(((uint)(a >> 16)) & 0x7f80) + (((uint)(b >> 16)) & 0x7f80)
; 	  ) >> 7);

	mov	dx,[bp+ahi]
; dfmul2fixed enters here
fmf:	mov	cx,260+12
	mov	ax,[bp+bhi]
	and	ax,7f80h
	and	dx,7f80h
	add	ax,dx
	xchg	ah,al		; ror ax,7 without using cl
	rol	ax,1
	sub	cx,ax
	push	cx		; e

; 	ulong ma = (ushort)(a >> 8) | 0x8000;
; 	ulong mb = (ushort)(b >> 8) | 0x8000;

	mov	si,[bp+alo+1]	; unaligned
	clear	ax
	mov	di,[bp+blo+1]	; unaligned
	or	si,8000h
	or	di,8000h

; 	ulong p1 = ma * (b & 0xff);

	mov	al,[bp+blo]
	mul	si

;			(Do this later:)
; 	ulong p = ma * mb;

; 	if ( (byte)a )		/* >16 mantissa bits */

	cmp	byte ptr [bp+alo],0
	je	mshort

; 	{	ulong p2 = (a & 0xff) * mb;
; 		p += ((((uint)(byte)a * (uint)(byte)b) >> 8) + p1 + p2) >> 8;

	mov	cx,dx		; (cx,bx) = p1
	mov	bx,ax
	clear	ax
	mov	al,[bp+alo]
	clear	dx
	mov	dl,[bp+blo]
	mul	dx		; ax = (byte)a * (byte)b, fits in 16 bits
	mov	dl,ah		; dx is zero
				; so dx = ((byte)a * (byte)b) >> 8
; Fix: add that byte product (dx) to p1.  The previous code added cx, the
; high word of p1, to its own low word and never used dx.
	add	bx,dx
	adc	cx,0
	clear	ax
	mov	al,[bp+alo]
	mul	di		; (dx,ax) = p2
	add	ax,bx
	adc	dx,cx

; 	}

mshort:

; 	else
; 		p += p1 >> 8;

; On both paths (dx,ax) holds the low-order partial sum at this point.
	mov	bl,ah		; set (cx,bx) = (dx,ax) >> 8
	mov	bh,dl
	clear	cx
	mov	cl,dh
	mov	ax,si
	mul	di		; (dx,ax) = ma * mb
	add	ax,bx
	adc	dx,cx

; 	if ( (uint)e < 32 )		/* e = -1 is possible */

; The 32-bit right shift is split in two ranges because the shifts work
; on 16-bit halves: 0..15 is handled at shr1, 16..31 inline below.
	pop	cx		; e
	cmp	cx,16
	jb	shr1

; 	else if ( e >= 32 )		/* also detects a=0 or b=0 */

	cmp	cx,0
	jl	eneg
	sub	cx,16
	cmp	cx,16
	jge	shr0
; 16 <= e < 32: the result is the high word shifted right by e-16.
	mov	ax,dx
	clear	dx
	shr	ax,cl
	jmp	ex

; 		return fixed_0;

shr0:	clear	ax
	clear	dx
	jmp	ex

; 	else
; 		p <<= -e;

; rol/xor isolates the bits of ax that cross into dx: rotating ax left
; and removing the bits that stayed in ax leaves ax >> (16-cl).
	even
eneg:	neg	cx
	shl	dx,cl
	mov	bx,ax
	shl	ax,cl
	rol	bx,cl
	xor	bx,ax
	add	dx,bx
	jmp	ex

; 		p >>= e;

; Mirror image of the code above: ror/xor leaves dx << (16-cl) in bx,
; which is added into the shifted low word.
	even
shr1:	shr	ax,cl
	mov	bx,dx
	shr	dx,cl
	ror	bx,cl
	xor	bx,dx
	add	ax,bx

ex:

; 	return ((a ^ b) < 0 ? -p : p);

	mov	cx,[bp+ahi]
	xor	cx,[bp+bhi]
	jge	pos
	neg	dx
	neg	ax
	sbb	dx,0
pos:

; }

retu:	pop	di
	pop	si
	mov	sp,bp
	pop	bp
	ret

_fmul2fixed_ ENDP

; The same routine with the first argument a double rather than a float.
; The argument is split into two pieces to reduce data movement.
;
; Stack layout after push bp: the low doubleword of the double at
; [bp+xalo], the float b at [bp+10], and the high doubleword of the
; double at [bp+xahi].  The routine builds a float bit pattern from the
; upper 48 bits of the double (the word at [bp+xalo] is never read),
; stores it over [bp+alo]/[bp+ahi] (offsets 6 and 8, i.e. on top of the
; low doubleword), and jumps to fmf inside _fmul2fixed_, which computes
; the product and returns to the caller.
; fmf expects dx = the new [bp+ahi]; si and di are pushed so that the
; frame matches the one _fmul2fixed_ unwinds.

	PUBLIC	_dfmul2fixed_
_dfmul2fixed_ proc far
	push	bp
	mov	bp,sp
xalo	equ	6
;b	equ	10
xahi	equ	14
	push	si		; overlap this below
	push	di		; ditto

; Shuffle the arguments and then use fmul2fixed.

; Squeeze 3 exponent bits out of the top 35 bits of a.

; bx keeps the top two bits of the highest word; dx keeps its low 11
; bits.  The three bits in between are dropped, and the 48-bit value
; dx:ax:cx is then shifted left by 3 to close the gap.
	mov	dx,[bp+xahi+2]
	mov	bx,0c000h
	mov	ax,[bp+xahi]
	and	bx,dx
	mov	cx,[bp+xalo+2]
	and	dx,7ffh		; get rid of discarded bits
; First shift step.  If cx becomes zero, nothing more can be carried out
; of it, so the shorter sequence at cz is used; jz leaves the carry from
; this add intact for the adc that follows on either path.
	add	cx,cx		; faster than shl!
	jz	cz		; detect common case
	adc	ax,ax		; faster than rcl!
	adc	dx,dx
; Second shift step.
	add	cx,cx
	adc	ax,ax
	adc	dx,dx
; Third shift step; the mov is interleaved and does not affect the carry.
	add	cx,cx
	adc	ax,ax
	mov	[bp+alo],ax
	adc	dx,dx
; Put the two preserved top bits back.
	or	dx,bx
	mov	[bp+ahi],dx
	jmp	fmf
	even
; Same three shift steps, but only ax and dx need shifting after the
; first carry has been absorbed.
cz:	adc	ax,ax
	adc	dx,dx
	add	ax,ax
	adc	dx,dx
	add	ax,ax
	mov	[bp+alo],ax
	adc	dx,dx
	or	dx,bx
	mov	[bp+ahi],dx
	jmp	fmf

_dfmul2fixed_ ENDP

	endif					; NOFPU


; Transpose an 8x8 bit matrix.  See gsmisc.c for the algorithm in C.
;
; Arguments, in stack order: byte *inp (far), int line_size,
; byte *outp (far), int dist.
; Eight bytes are read from inp at a stride of line_size, and the eight
; bytes of the transposed matrix are written to outp at a stride of dist.
; Saved: bp, ds, si, di.   Clobb: ax, bx, cx, dx, flags.
; The arguments are left on the stack (plain far ret).
	PUBLIC	_memflip8x8

; Argument offsets from bp once the frame below is set up
; (2 bytes of saved bp + 4 bytes of far return address).
mf_inp		equ	6
mf_line_size	equ	10
mf_outp		equ	12
mf_dist		equ	16

; Step si to the next row (stride in di) and read that row into reg.
mf_fetch macro	reg
	add	si,di
	mov	reg,[si]
	endm

; Step si to the next row (stride in di) and write reg to that row.
mf_put	macro	reg
	add	si,di
	mov	[si],reg
	endm

; One transposition step, see the C code for explanation:
;   t = ((wa >> nbits) ^ wb) & keep;  wb ^= t;  wa ^= t << nbits
; si is used as the temporary.
mf_xpose macro	wa,wb,nbits,keep
	mov	si,wa
	shr	si,nbits
	xor	si,wb
	and	si,keep
	xor	wb,si
	shl	si,nbits
	xor	wa,si
	endm

_memflip8x8 proc far
	push	bp
	mov	bp,sp
	push	ds
	push	si
	push	di

		; Read the eight input rows.  Naming the rows A..H, the
		; registers start out as
		; ax = AB, cx = CD, bx = EF, dx = GH
		; and are rearranged below into
		; ax = AE, bx = BF, cx (or di) = CG, dx = DH.
	mov	di,[bp+mf_line_size]
	lds	si,[bp+mf_inp]
	mov	ah,[si]
	mf_fetch al
	mf_fetch ch
	mf_fetch cl
	mf_fetch bh
	mf_fetch bl
	mf_fetch dh
	mf_fetch dl

		; 4x4 blocks.  cl must hold the shift count, so the CD
		; pair lives in di for the duration.
	mov	di,cx
	mov	cl,4
	mf_xpose dx,di,cl,0f0fh
	mf_xpose bx,ax,cl,0f0fh

		; Regroup the rows: swap D with G and B with E.
	mov	cx,di
	xchg	cl,dh
	xchg	al,bh

		; 2x2 blocks, again with CG parked in di.
	mov	di,cx
	mov	cl,2
	mf_xpose dx,bx,cl,3333h
	mf_xpose di,ax,cl,3333h
	mov	cx,di				; cl is free again from here on

		; 1x1 blocks (constant shift count).
	mf_xpose dx,cx,1,5555h
	mf_xpose bx,ax,1,5555h

		; Write the eight output rows in order A..H.
	mov	di,[bp+mf_dist]
	lds	si,[bp+mf_outp]
	mov	[si],ah
	mf_put	bh
	mf_put	ch
	mf_put	dh
	mf_put	al
	mf_put	bl
	mf_put	cl
	mf_put	dl

	pop	di
	pop	si
	pop	ds
	pop	bp
	ret
_memflip8x8 ENDP


utilasm_TEXT ENDS
	END