;
; draw a triangle
;
; input:
; 	the global registers "start", "right", and "left"
; These point to 3 points of the triangle, stored in GPU RAM
; as 6 longs each: X,Y,Z,I,U,V
; (I = intensity, U = source texture X, V = source texture Y)
; Each number is a 16.16 fixed point value.
; The triangle is assumed to be clockwise when walked in the order
; "start", then "right", then "left".
;
; 	the global registers "polyfunc" and "trapfunc" point
; to the functions to call per-triangle and per-trapezoid;
; i.e. "polyfunc" sets up registers (such as B_ZINC and B_IINC)
; which are constant for the whole triangle; "trapfunc" does
; the actual scan conversion
;

;
; registers used

; values for point A
; (NOTE(review): the ",1" on each .equr presumably selects GPU register
; bank 1 -- confirm against the assembler manual)
; Each point gets six registers, one per component: x, y, z, i, u, v.
Ax		.equr	r4,1
Ay		.equr	r5,1
Az		.equr	r6,1
Ai		.equr	r7,1
Au		.equr	r8,1
Av		.equr	r9,1

; values for point B
Bx		.equr	r10,1
By		.equr	r11,1
Bz		.equr	r12,1
Bi		.equr	r13,1
Bu		.equr	r14,1
Bv		.equr	r15,1

; values for point C
Cx		.equr	r16,1
Cy		.equr	r17,1
Cz		.equr	r18,1
Ci		.equr	r19,1
Cu		.equr	r20,1
Cv		.equr	r21,1

gaddr		.equr	r22,1		; holds the address of G_FLAGS (loaded with movei in the left X step code)

	.globl	dotri
dotri:

;
; Step 1: sort the points so that point A is the
; uppermost point, B is the right hand point, and
; C is the left hand point
;
; to do this, we only need the Y values for the
; points, and a pointer to the points; we can
; defer the actual loading of the points until
; we know the order
;
; The "sort" is a rotation of the three (pointer, Y) pairs
; (A<-B<-C<-A or A<-C<-B<-A), never a swap, so the cyclic order
; start -> right -> left of the input is preserved.

; temporary re-use of registers
; (the pointers live in the V registers, which are the last
; component registers to be loaded later on)
Aptr		.equr	Av
Bptr		.equr	Bv
Cptr		.equr	Cv

;
; start by assuming that the "start" point is
; uppermost
;
	move	start,Aptr
	move	right,Bptr
	addqt	#4,Aptr			; point at Y component of structure
	addqt	#4,Bptr			; ditto
	load	(Aptr),Ay
	load	(Bptr),By
	subqt	#4,Aptr			; back to the start of the structure
	subqt	#4,Bptr
	move	left,Cptr
	addqt	#4,Cptr			; and another Y component
	load	(Cptr),Cy
	subqt	#4,Cptr

	; strip fractional bits from the Y values
	; (only integral Y values matter)
	; shifting right then left by 16 clears the low 16 bits and
	; leaves the upper 16 bits (including the sign) unchanged

	shrq	#16,Ay
	shrq	#16,By
	shlq	#16,Ay
	shlq	#16,By
	shrq	#16,Cy
	shlq	#16,Cy

	; rename A,B,C so that A is at the top
	; this needs to happen only if (Ay > By) || (Ay >= Cy)
	;  i.e. if Ay-By <=0 and Ay-Cy < 0 then A is already on top
	;
	; temp2 = Ay-By-1 is negative exactly when Ay-By <= 0
	; temp1 = Ay-Cy   is negative exactly when Ay-Cy <  0
	; temp0/temp1 are then reloaded with the old Aptr/Ay for use by
	; the rotation code; the branch below relies on the flags from
	; the AND still being in effect after the move that follows it.

	move	Ay,temp2
	move	Ay,temp1
	sub	By,temp2
	sub	Cy,temp1
	subq	#1,temp2
	move	Aptr,temp0
	and	temp1,temp2	; result is negative only if both temp1 and temp2 are negative
	move	Ay,temp1
	jr	MI,.donerot
	; NO NOP: the cmp below fills the jr delay slot (branch optimization);
	; it is executed even when the branch is taken, which is harmless

	; if (By > Cy) rotate counter clockwise
	; i.e. if (Cy - By) >= 0 skip this
	cmp	By,Cy
	jr	PL,.clockwise
	; the next move is in the delay slot, so it also runs on the
	; .clockwise path, where Ay is overwritten again with By
	move	Cy,Ay			;**branch optimization
		; counter clockwise rotation: A <- C, C <- B, B <- old A
		move	Cptr,Aptr
		move	By,Cy
		move	Bptr,Cptr
		move	temp1,By
		jr	.donerot
		move	temp0,Bptr
.clockwise:
	; clockwise rotation: A <- B, B <- C, C <- old A
	move	Bptr,Aptr
	move	By,Ay
	move	Cptr,Bptr
	move	Cy,By
	move	temp0,Cptr		; temp0 has the old value of Aptr
	move	temp1,Cy		; temp1 has the old value of Ay
.donerot:

; here the points are sorted properly
; now load them from GPU RAM
;
; Each point is 6 longs: X at +0, Y at +4, Z at +8, I at +12,
; U at +16, V at +20.  Y is already in Ay/By/Cy, so it is skipped.
; U and V are only loaded when TEXTURES is enabled.

	load	(Aptr),Ax
	load	(Bptr),Bx
	addqt	#8,Aptr			; skip y, we already have it
	addqt	#8,Bptr
	load	(Aptr),Az
	load	(Bptr),Bz
	addqt	#4,Aptr
	addqt	#4,Bptr
	load	(Aptr),Ai
	load	(Bptr),Bi
.if TEXTURES
	addqt	#4,Aptr
	addqt	#4,Bptr
	load	(Aptr),Au
	load	(Bptr),Bu
	addqt	#4,Aptr
	addqt	#4,Bptr
	; Aptr/Bptr are aliases of Av/Bv, so these loads must come last
	load	(Aptr),Av		; these instructions trash Aptr and Bptr!
	load	(Bptr),Bv
.endif

	; Point C is read through two pointers that leapfrog each other:
	; temp0 visits X (+0) and I (+12); Cptr visits Z (+8), U (+16)
	; and V (+20).
	move	Cptr,temp0
	addqt	#8,Cptr			; skip y, we already have it
	load	(temp0),Cx
	load	(Cptr),Cz
	addqt	#12,temp0
	addqt	#8,Cptr
	load	(temp0),Ci
.if TEXTURES
	load	(Cptr),Cu
	addqt	#4,Cptr
	; Cptr is an alias of Cv, so this load must come last
	load	(Cptr),Cv		; this instruction trashes Cptr
.endif

;
; save some values which will be used when stepping
; through the scan lines
; (after this we'll have some registers free!)
;
; Both edges start at point A, so leftx and rightx both begin at Ax.
; The values are copied to the other register bank with moveta and
; read back later with movefa.
;
	moveta	Ax,leftx
	moveta	Ay,ay
	moveta	Ax,rightx
	moveta	Az,leftz
	moveta	Ai,lefti
.if TEXTURES
	moveta	Au,leftu
	moveta	Av,leftv
.endif

;*******************************************************************
; Calculate various increments that are constant across the whole
; triangle.
; These include the Z,I,U, and V increments.
; See Graphics Gems I p. 361 ("Scanline Depth Gradient of a
; Z-Buffered Triangle") for the derivation of the formula
;    dz/dx = zinc/xinc
; where
;    zinc = (z2-z1)(y3-y1) - (z3-z1)(y2-y1)
;    xinc = (x2-x1)(y3-y1) - (x3-x1)(y2-y1)
; Similar formulae hold for I, U, and V.
;*******************************************************************

i1	.equr	Ax		; temporary registers for zinc calculation
i2	.equr	Ay
y1	.equr	Az		; confusing name: holds By-Ay, i.e. the (y2-y1) term of the formula
y2	.equr	Ai		; holds Ay-Cy, i.e. minus the (y3-y1) term of the formula
xinc	.equr	Au		; will actually hold mantissa of 1/xinc
xnorm	.equr	Av		; will actually hold exponent of 1/xinc
; (re-using the A registers is possible because the A values that are
; needed later have already been copied out with moveta)

;
;
;	y2 = (pgon->pt[i+2].y - pgon->pt[0].y);
;	y1 = -(pgon->pt[i+1].y - pgon->pt[0].y);
;	ynorm = inormi2(y1, y2);
;	y1 = y1 >> ynorm;
;	y2 = y2 >> ynorm;
;
; NOTE: the assembly forms y1 = By-ay and y2 = ay-Cy, and later
; i1 = A-B and i2 = A-C.  The signs of the individual terms differ
; from the pseudo-code, but each product (and so i1*y2 + i2*y1) has
; the same value.

	movefa	ay,y1
	movefa	ay,y2
	sub	By,y1
	sub	Cy,y2
	neg	y1
	; build a mask of the bits used by |y1| and |y2| so that one
	; common shift can be chosen for both
	move	y2,temp2
	move	y1,temp1
	abs	temp2
	abs	temp1
	or	temp2,temp1
	normi	temp1,temp1
	addqt	#10,temp1		; find normalizer to 14 bits
	; NOTE(review): sha is presumably right-shift for a positive count
	; and left-shift for a negative one -- confirm
	sha	temp1,y1
	sha	temp1,y2

;	i2 = (pgon->pt[i+2].x - pgon->pt[0].x);
;	i1 = (pgon->pt[i+1].x - pgon->pt[0].x);
;	xnorm = inormi2(i1, i2);
;	i1 = i1 >> xnorm;
;	i2 = i2 >> xnorm;
;	xinc = (i1*y2 + i2*y1);

	movefa	leftx,i1
	movefa	leftx,i2
	sub	Bx,i1
	sub	Cx,i2
	move	i1,temp1
	move	i2,temp2
	abs	temp1
	abs	temp2
	or	temp2,temp1
	normi	temp1,xnorm
	; the two indented instructions build the dividend 1<<28 in xinc;
	; they are interleaved with the normalization code
		moveq	#1,xinc
	addqt	#10,xnorm		; find normalizer to 14 bits
		shlq	#28,xinc
	sha	xnorm,i1
	sha	xnorm,i2
	; temp1 = i1*y2 + i2*y1 via multiply/accumulate
	imultn	i1,y2
	imacn	i2,y1
	resmac	temp1

.if 0
;
; do backface culling: if temp1 <= 0, the polygon is facing away from us
;
; This block is disabled by the ".if 0".
; NOTE(review): "donedraw" is not defined in this part of the file;
; the exit label defined here is "donetri" -- check before enabling.
;
	cmpq	#1,temp1
	movei	#donedraw,temp0
	jump	MI,(temp0)
.endif

;
; set xinc to 1/xinc, so we can multiply by it
; rather than having to divide all the time
;
; xinc = (1<<28) / temp1.  The code that follows (calczinc) is arranged
; to do useful work while this divide is in progress.
; NOTE(review): temp1 is 0 when the three points are collinear; the
; divide is not guarded against that -- confirm callers never pass a
; degenerate triangle, or that the result is harmless.
;
	div	temp1,xinc

calczinc:
;
; calculate Z increment
; start the calculations while the divide for
; xinc is going on
;
; Same pattern as for X: form the two Z differences (A-B and A-C),
; normalize both by a common shift kept in temp1, then
; temp0 = i1*y2 + i2*y1.
;
	movefa	leftz,i1
	movefa	leftz,i2
	sub	Bz,i1
	sub	Cz,i2
	move	i1,temp1
	move	i2,temp2
	abs	temp1
	abs	temp2
	or	temp2,temp1
	normi	temp1,temp1
	addqt	#10,temp1		; find normalizer to 14 bits
	sha	temp1,i1
	sha	temp1,i2
	imultn	i1,y2
	imacn	i2,y1
	resmac	temp0

;
; normalize 1/xinc (actually, we have 1/(xinc >> xnorm) = (1/xinc) << xnorm)
;
; This is the first use of the divide result.  The shift applied to
; the mantissa is subtracted from the exponent in xnorm.
;

	normi	xinc,temp2
	addqt	#10,temp2
	sha	temp2,xinc		; xinc is now a 14 bit fraction
	sub	temp2,xnorm
;
; now find temp0 * 1/xinc
;
; temp1 still holds the shift that was applied to the Z differences;
; negating it and adding xnorm gives the final scaling shift.
; The "addqt #14,xnorm" is done once, here; the I, U and V increment
; code further down re-uses the adjusted xnorm.
	neg	temp1
	addqt	#14,xnorm
	sharq	#14,temp0
	add	xnorm,temp1
	imult	xinc,temp0
	sha	temp1,temp0

	moveta	temp0,b_zinc

calciinc:
;
; calculate I increment
;
; Same computation as for the Z increment, using the intensity
; components.  xinc/xnorm are already normalized and are only read.
;
	movefa	lefti,i1
	movefa	lefti,i2
	sub	Bi,i1
	sub	Ci,i2
	move	i1,temp1
	move	i2,temp2
	abs	temp1
	abs	temp2
	or	temp2,temp1
	normi	temp1,temp1
	addqt	#10,temp1		; find normalizer to 14 bits
	sha	temp1,i1
	sha	temp1,i2
	imultn	i1,y2
	imacn	i2,y1
	resmac	temp0

;
; now find temp0 * 1/xinc
;
; temp1 = xnorm - (shift applied to the I differences)
	neg	temp1
	sharq	#14,temp0
	add	xnorm,temp1
	imult	xinc,temp0
	sha	temp1,temp0
	moveta	temp0,b_iinc

.if TEXTURES
calcuinc:
;
; calculate U increment
;
; Same computation as for the Z and I increments, using the texture
; U components.
;
	movefa	leftu,i1
	movefa	leftu,i2
	sub	Bu,i1
	sub	Cu,i2
	move	i1,temp1
	move	i2,temp2
	abs	temp1
	abs	temp2
	or	temp2,temp1
	normi	temp1,temp1
	addqt	#10,temp1		; find normalizer to 14 bits
	sha	temp1,i1
	sha	temp1,i2
	imultn	i1,y2
	imacn	i2,y1
	resmac	temp0

;
; now find temp0 * 1/xinc
; (temp1 = xnorm - shift applied to the U differences)
	neg	temp1
	sharq	#14,temp0
	add	xnorm,temp1
	imult	xinc,temp0
	sha	temp1,temp0

	moveta	temp0,b_uinc

calcvinc:
;
; calculate V increment
;
; Same computation again, using the texture V components.
;
	movefa	leftv,i1
	movefa	leftv,i2
	sub	Bv,i1
	sub	Cv,i2
	move	i1,temp1
	move	i2,temp2
	abs	temp1
	abs	temp2
	or	temp2,temp1
	normi	temp1,temp1
	addqt	#10,temp1		; find normalizer to 14 bits
	sha	temp1,i1
	sha	temp1,i2
	imultn	i1,y2
	imacn	i2,y1
	resmac	temp0

;
; now find temp0 * 1/xinc
; (temp1 = xnorm - shift applied to the V differences)
	neg	temp1
	sharq	#14,temp0
	add	xnorm,temp1
	imult	xinc,temp0
	sha	temp1,temp0

	moveta	temp0,b_vinc
.endif ; TEXTURES



;*******************************************************************
; Call the per-polygon setup function to initialize various blitter
; registers (e.g. B_ZINC, B_IINC, A1_INC, A1_FINC)
;
; The return address is passed in "return": the address captured by
; "move PC,return" plus 6, i.e. the instruction following the addqt
; (three instructions, assuming 2 bytes each).  The addqt itself
; executes in the delay slot of the jump.
;*******************************************************************

	move	PC,return
	jump	(polyfunc)
	addqt	#6,return

;*******************************************************************
; Calculate right side step values
; at present, only 1 right side step value is needed (for the X
; value), and so we just divide delta X by the number of lines.
;
; The divide is done on the absolute value of delta X; the sign is
; put back afterwards.  The "jr CC" tests the carry left by abs
; (NOTE(review): presumably set when the operand was negative --
; confirm), and the div sits in the delay slot of the jr so it is
; executed on both paths.
;
; NOTE(review): the line count is 0 when int(By) == int(ay).  That
; can only happen in "case 2" below, where the first trapezoid is
; then skipped and the right step is recalculated, so the result of
; the divide by zero appears to be unused -- confirm.
;*******************************************************************

	movefa	ay,temp0		; calculate the number of lines
	move	By,temp1
	sharq	#16,temp0
	sharq	#16,temp1
	sub	temp0,temp1		; temp1 = # of lines
	move	Bx,temp2
	movefa	rightx,temp0
	shlq	#16,temp1
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posright
	div	temp1,temp2
		neg	temp2
.posright:
	moveta	temp2,rightxstep

;*******************************************************************
; Calculate left side step values
; We need step values for X, Z, I, U, and V.
;
; Each step is (C.component - A.component) / number of lines, and is
; computed below with one "div" per component, using the same
; abs / div / conditional neg pattern as the right side step.
; temp1 holds the number of lines (shifted left by 16) for the whole
; section and must not be modified between the divides.
;
; NOTE(review): an earlier version of this comment said that
; 1/numlines is calculated once, stored as a 15 bit fraction in
; "xinc" with an exponent in "xnorm", and multiplied in.  The code
; below does not do that; at this point xinc/xnorm hold the
; reciprocal used for the Z/I/U/V increments.
;*******************************************************************
;
	movefa	ay,temp0		; find the number of lines
	move	Cy,temp1		; on the left side
	sharq	#16,temp0
	sharq	#16,temp1
	sub	temp0,temp1		; temp1 = # of lines
	shlq	#16,temp1
;
; calculate left X step
;
	move	Cx,temp2
	movefa	leftx,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftx
	div	temp1,temp2
		neg	temp2
.posleftx:
	movei	#G_FLAGS,gaddr			; put this instruction here so it executes during the divide
	moveta	temp2,leftxstep

;
; calculate left Z step
;
	move	Cz,temp2
	movefa	leftz,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftz
	div	temp1,temp2
		neg	temp2
.posleftz:
	moveta	temp2,leftzstep

;
; calculate left I step
;
	move	Ci,temp2
	movefa	lefti,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.poslefti
	div	temp1,temp2
		neg	temp2
.poslefti:
	moveta	temp2,leftistep

.if TEXTURES
;
; calculate left U step
;
	move	Cu,temp2
	movefa	leftu,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftu
	div	temp1,temp2
		neg	temp2
.posleftu:
	moveta	temp2,leftustep

;
; calculate left V step
;
	move	Cv,temp2
	movefa	leftv,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftv
	div	temp1,temp2
		neg	temp2
.posleftv:
	moveta	temp2,leftvstep
.endif

	; NOTE(review): no ".equr" for ynorm is visible in this part of
	; the file (the code keeps the Y normalizer in temp1) -- confirm
	; that this directive is still needed.
	.equrundef	ynorm

;*******************************************************************
; Prepare to scan convert the triangle, in two pieces
; (with flat tops and bottoms)
;
; at this point there are two possibilities:
;    (1)   A		or (2)	A
;	C			 B
;	  B		      C
; For both cases we need to draw 2 trapezoids.
; in case (1) we need to recalculate the left step
; values once, in case (2) the right step values
; once.
;
; case 1 holds if By > Cy, i.e. if Cy - By < 0
; (By and Cy had their fractional bits stripped earlier, so this
; compares whole scan lines; By == Cy is handled as case 2)
;
;*******************************************************************

	cmp	By,Cy
	movei	#tricase2,temp0
	jump	PL,(temp0)
	nop
;
; case 1: first trapezoid
;
; Draws from A down to the scan line of C (the left hand point, which
; is reached first in case 1).
;
; The number of lines is int(Cy) - int(ay).  The integer parts are
; taken with an arithmetic shift (sharq), the same way as in every
; other line count calculation in this routine, so that the count is
; a correct signed value even when one of the Y values is negative.
; (For non-negative Y values the result is the same as with a logical
; shift.)
;
; If the count is zero the trapezoid is skipped.  Otherwise trapfunc
; is called with the return address in "return" and with bit 14 of
; G_FLAGS cleared, which selects register bank 0.
;
	movefa	ay,temp0			; temp0 = Y of the top of this trapezoid
	move	Cy,temp1
	sharq	#16,temp0			; signed integer part
	sharq	#16,temp1
	sub	temp0,temp1			; temp1 = # of lines = int(Cy)-int(ay)
	jr	EQ,.notrap1			; no lines: nothing to draw
	moveta	temp1,anumlines			; (delay slot: executed on both paths)
		load	(gaddr),temp0
.call1:
		move	PC,return		; draw the trapezoid
		bclr	#14,temp0		; the scan line renderer wants to be in bank 0
		addqt	#(.notrap1-.call1),return	; return address = .notrap1
		jump	(trapfunc)
		store	temp0,(gaddr)		; switch banks while the jump is flushing the pre-fetch queue
.notrap1:
;
; case 1: second trapezoid
; recalculate left step values here
;
; The left edge now runs from C to B, so the left start values are
; reset to point C and the left steps are (B - C) / number of lines.
; In case 1 By > Cy (whole scan lines), so the number of lines is
; at least 1 here.
; The right edge (A to B) is left as it is.
;
	moveta	Cx,leftx
	moveta	Cy,ay
	moveta	Cz,leftz
	moveta	Ci,lefti
.if TEXTURES
	moveta	Cu,leftu
	moveta	Cv,leftv
.endif
	movefa	ay,temp0
	move	By,temp1
	sharq	#16,temp0
	sharq	#16,temp1
	sub	temp0,temp1		; temp1 = # of lines
	moveta	temp1,anumlines
	shlq	#16,temp1
;
; calculate left X step
;
	move	Bx,temp2
	movefa	leftx,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftx2
	div	temp1,temp2
		neg	temp2
.posleftx2:
	moveta	temp2,leftxstep

;
; calculate left Z step
;
	move	Bz,temp2
	movefa	leftz,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftz2
	div	temp1,temp2
		neg	temp2
.posleftz2:
	moveta	temp2,leftzstep

;
; calculate left I step
;
	move	Bi,temp2
	movefa	lefti,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.poslefti2
	div	temp1,temp2
		neg	temp2
.poslefti2:
	moveta	temp2,leftistep

.if TEXTURES
;
; calculate left U step
;
	move	Bu,temp2
	movefa	leftu,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftu2
	div	temp1,temp2
		neg	temp2
.posleftu2:
	moveta	temp2,leftustep

;
; calculate left V step
;
	move	Bv,temp2
	movefa	leftv,temp0
	sub	temp0,temp2
	abs	temp2
	jr	CC,.posleftv2
	div	temp1,temp2
		neg	temp2
.posleftv2:
	moveta	temp2,leftvstep
.endif

; case 1, second trapezoid:
; draw the second trapezoid, then go to the end
;
; The return address is loaded directly with movei, so trapfunc
; returns to donetri and the case 2 code below is skipped.
;
	load	(gaddr),temp0
	movei	#donetri,return
	bclr	#14,temp0
	jump	(trapfunc)
	store	temp0,(gaddr)		; switch to register bank 0

tricase2:
;
; case 2, first trapezoid:
;
; Draws from A down to the scan line of B (the right hand point, which
; is reached first, or at the same time as C, in case 2).
;
; The number of lines is int(By) - int(ay).  The integer parts are
; taken with an arithmetic shift (sharq), the same way as in every
; other line count calculation in this routine, so that the count is
; a correct signed value even when one of the Y values is negative.
; (For non-negative Y values the result is the same as with a logical
; shift.)
;
; If the count is zero the trapezoid is skipped.  Otherwise trapfunc
; is called with the return address in "return" and with bit 14 of
; G_FLAGS cleared, which selects register bank 0.
;
	movefa	ay,temp0			; temp0 = Y of the top of this trapezoid
	move	By,temp1
	sharq	#16,temp0			; signed integer part
	sharq	#16,temp1
	sub	temp0,temp1			; temp1 = # of lines = int(By)-int(ay)
	jr	EQ,.notrap1			; no lines: nothing to draw
	moveta	temp1,anumlines			; (delay slot: executed on both paths)
		load	(gaddr),temp0
.call2:
		move	PC,return		; draw the trapezoid
		bclr	#14,temp0		; select register bank 0 for the scan line renderer
		addqt	#(.notrap1-.call2),return	; return address = .notrap1
		jump	(trapfunc)
		store	temp0,(gaddr)		; switch banks in the delay slot of the jump
.notrap1:
;
; case 2, second trapezoid:
; recalculate right side step values
;
; The right edge now runs from B to C, so rightx restarts at Bx and
; the right X step is (Cx - Bx) / number of lines.  The left edge
; (A to C) is left as it is.
;
; NOTE(review): unlike the first trapezoid there is no check for a
; line count of zero here.  When int(By) == int(Cy) the divide below
; divides by zero and trapfunc is called with anumlines == 0 --
; confirm that trapfunc copes with that.
;
	move	By,temp0
	move	Cy,temp1
	sharq	#16,temp0
	sharq	#16,temp1
	sub	temp0,temp1		; temp1 = # of lines = int(Cy)-int(By)
	move	Cx,temp2
	moveta	temp1,anumlines
	sub	Bx,temp2
	shlq	#16,temp1
	abs	temp2
	jr	CC,.posright
	div	temp1,temp2
		neg	temp2
.posright:
	moveta	Bx,rightx
	moveta	temp2,rightxstep

; case 2, second trapezoid:
; draw the trapezoid
; (the return address is donetri, computed relative to .call3)
	load	(gaddr),temp0
.call3:
	move	PC,return		; draw the trapezoid
	bclr	#14,temp0
	addqt	#(donetri-.call3),return
	jump	(trapfunc)
	store	temp0,(gaddr)		; switch banks while the jump is happening

; ALL DONE!
; Both cases return here after their last trapezoid.  There is no
; return instruction in this part of the file: execution continues
; with whatever code follows.
donetri:

	; release the register names defined for this routine
	; (temporaries used for the increment calculations)
	.equrundef	y1
	.equrundef	y2
	.equrundef	i1
	.equrundef	i2
	.equrundef	xnorm
	.equrundef	xinc

	; point A
	.equrundef	Ax
	.equrundef	Ay
	.equrundef	Az
	.equrundef	Ai
	.equrundef	Au
	.equrundef	Av
	.equrundef	Aptr

	; point B
	.equrundef	Bx
	.equrundef	By
	.equrundef	Bz
	.equrundef	Bi
	.equrundef	Bu
	.equrundef	Bv
	.equrundef	Bptr

	; point C
	.equrundef	Cx
	.equrundef	Cy
	.equrundef	Cz
	.equrundef	Ci
	.equrundef	Cu
	.equrundef	Cv
	.equrundef	Cptr

	.equrundef	gaddr
