		.include		"jaguar.inc"

	.globl 		_gpu_start
	.globl 		_gpu_end
	.globl 		_DoGpuBlit
	.globl 		BlitDone

	.globl 		_Blit
	.globl 		_InitGpuCode

	.globl		_BlitAddIndex
	.globl		_BlitUseIndex

;	.extern		_WaitForBlitter

;;; Equates
;;; Bit widths of the LZSS token fields.  Note that the INPUT_BITS invocations
;;; in delzss pass the literals 13 and 4 directly rather than these symbols.
INDEX_BIT_COUNT		.equ	13
LENGTH_BIT_COUNT	.equ	4
;;; (1+13+4)/9 evaluates to 2; delzss adds this to every decoded match length.
BREAK_EVEN			.equ	((1+INDEX_BIT_COUNT+LENGTH_BIT_COUNT)/9)		

	.68000
	.data
	.phrase

; _InitGpuCode -- 68000 side: blit the GPU image (_gpu_start.._gpu_end) into
; GPU RAM and start the GPU at _DoGpuBlit.
;   In:       nothing
;   Out:      nothing
;   Clobbers: d0, a0
_InitGpuCode:
	move.l	B_CMD,d0			; Spin until status bit 0 is set
	btst	#0,d0				; (bit 0 clear = Blitter still busy)
	beq		_InitGpuCode
;	jsr		_WaitForBlitter

; Build the B_COUNT value first: inner count = image length in longwords,
; outer count = 1 (upper word).
	lea		_gpu_start,a0
	move.l	#_gpu_end,d0
	sub.l	a0,d0
	lsr.l	#2,d0
	or.l	#$00010000,d0

; Source window (A2): the image as assembled, starting at _gpu_start.
	move.l	a0,A2_BASE
	move.l	#PITCH1|PIXEL32|XADDPHR,A2_FLAGS
	move.l	#0,A2_PIXEL

; Destination window (A1): the GPU RAM address + $8000, which is the alias
; used for 32-bit copies.
	move.l	#_DoGpuBlit+$8000,A1_BASE
	move.l	#PITCH1|PIXEL32|XADDPHR,A1_FLAGS
	move.l	#0,A1_CLIP			; Required for Blitter Bug
	move.l	#0,A1_PIXEL

; Kick off the copy.
	move.l	d0,B_COUNT
	move.l	#SRCEN|LFU_REPLACE,B_CMD

_BlitterWait:
	move.l	B_CMD,d0			; Spin until the copy has finished
	btst	#0,d0
	beq		_BlitterWait
;	jsr		_WaitForBlitter

; Point the GPU at its entry point and set G_CTRL to 1 to start it.
	move.l	#_DoGpuBlit,G_PC
	move.l	#1,G_CTRL
	rts


	.phrase
_gpu_start:
	.GPU
	.ORG	G_RAM+$100

; Register assignments for the blit dispatcher (r20-r27).  The LZSS decoder
; further down uses r0-r19, so these survive a call into delzss.
TEMPADDR	.REGEQU		r20		; scratch: address for the next load/store
TEMPDATA	.REGEQU		r21		; scratch: value being loaded/stored
ADDINDEX	.REGEQU		r22		; copy of _BlitAddIndex
USEINDEX	.REGEQU		r23		; copy of _BlitUseIndex (entry being processed)
BLITBASE	.REGEQU		r24		; address of the current _Blit entry
BLITTYPE	.REGEQU		r25		; first long of the entry (dispatch code)
JUMPDEST	.REGEQU		r26		; target register for indirect jumps
ORDATA		.REGEQU		r27		; second copy of the B_SRCZ1 source value

; _DoGpuBlit -- GPU main loop.
; Spins until _BlitAddIndex differs from _BlitUseIndex, then locates the
; entry at _Blit + 80*USEINDEX, waits for the Blitter to go idle and
; dispatches on the entry's first long:
;   1 -> BitmapBlit, 2 -> ScaledBlit, 3 -> ExpandImage, 4 -> StopGpu,
;   anything else falls through into PatternBlit.
; Every "nop" after a jr/jump fills the instruction slot following the branch.
	.EVEN
_DoGpuBlit:
	movei	#_BlitAddIndex,TEMPADDR
	load	(TEMPADDR),ADDINDEX		; Get the Add Index

	movei	#_BlitUseIndex,TEMPADDR
	load	(TEMPADDR),USEINDEX		; Get the Use Index

	cmp		ADDINDEX,USEINDEX		; Equal indices: nothing to do yet
	jr		EQ,_DoGpuBlit			; ... so poll again
	nop

BlitStart:
	movei	#80,BLITBASE			; 80 bytes (20 longs) per entry
	mult	USEINDEX,BLITBASE
	movei	#_Blit,TEMPDATA
	add		TEMPDATA,BLITBASE		; Set Blit Base	{ArrayBase+(DataSize*UseIndex)}

	movei	#B_CMD,TEMPADDR			; Get Blitter status
BlitWait:
	load	(TEMPADDR),TEMPDATA		; Wait until Blitter is	not	busy
	btst	#0,TEMPDATA				; Bit 0 clear = still busy
	jr		EQ,BlitWait
	nop

	move   BLITBASE,r14				; r14 is the base for the (r14+n) loads below

	load	(r14),BLITTYPE			; Get Blit Type

	movei	#BitmapBlit,JUMPDEST	; Type 1
	cmpq	#1,BLITTYPE
	jump	EQ,(JUMPDEST)
	nop

	movei	#ScaledBlit,JUMPDEST	; Type 2
	cmpq	#2,BLITTYPE
	jump	EQ,(JUMPDEST)
	nop

	movei	#ExpandImage,JUMPDEST	; Type 3
	cmpq	#3,BLITTYPE
	jump	EQ,(JUMPDEST)
	nop

	movei	#StopGpu,JUMPDEST		; Type 4
	cmpq	#4,BLITTYPE
	jump	EQ,(JUMPDEST)
	nop
									; Any other type: fall through to PatternBlit

; PatternBlit -- default blit type (reached by fall-through from the dispatch).
; Only A1_BASE, A1_FLAGS, A1_PIXEL and A1_STEP come from the entry (longs 1,
; 2, 4 and 5).  A1_CLIP, the A1 fractional/increment registers and all four
; A2 registers are written with zero.  Continues at OtherStuff.
PatternBlit:
	load	(r14+1),TEMPDATA		; Set A1_BASE
	movei	#A1_BASE,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+2),TEMPDATA		; Set A1_FLAGS
	movei	#A1_FLAGS,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#0,TEMPDATA				; Set A1_CLIP
	movei	#A1_CLIP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+4),TEMPDATA		; Set A1_PIXEL
	movei	#A1_PIXEL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+5),TEMPDATA		; Set A1_STEP
	movei	#A1_STEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#0,TEMPDATA		  		; Set A1_FSTEP
	movei	#A1_FSTEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

; TEMPDATA stays zero for all of the remaining stores in this routine.
	movei	#A1_FPIXEL,TEMPADDR		; Set A1_FPIXEL
	store	TEMPDATA,(TEMPADDR)

	movei	#A1_INC,TEMPADDR		; Set A1_INC
	store	TEMPDATA,(TEMPADDR)

	movei	#A1_FINC,TEMPADDR		; Set A1_FINC
	store	TEMPDATA,(TEMPADDR)

	movei	#A2_BASE,TEMPADDR		; Set A2_BASE
	store	TEMPDATA,(TEMPADDR)

	movei	#A2_FLAGS,TEMPADDR		; Set A2_FLAGS
	store	TEMPDATA,(TEMPADDR)

	movei	#A2_PIXEL,TEMPADDR		; Set A2_PIXEL
	store	TEMPDATA,(TEMPADDR)

	movei	#A2_STEP,TEMPADDR		; Set A2_STEP
	store	TEMPDATA,(TEMPADDR)

	movei	#OtherStuff,JUMPDEST	; Go program the common B_* registers
	jump	(JUMPDEST)
	nop

; BitmapBlit -- blit type 1.
; A1_BASE/FLAGS/PIXEL/STEP come from entry longs 1, 2, 4, 5 and
; A2_BASE/FLAGS/PIXEL/STEP from longs 10-13.  A1_CLIP and the A1
; fractional/increment registers are written with zero.
; Continues at OtherStuff.
BitmapBlit:
	load	(r14+1),TEMPDATA		; Set A1_BASE
	movei	#A1_BASE,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+2),TEMPDATA		; Set A1_FLAGS
	movei	#A1_FLAGS,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#0,TEMPDATA				; Set A1_CLIP
	movei	#A1_CLIP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+4),TEMPDATA		; Set A1_PIXEL
	movei	#A1_PIXEL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+5),TEMPDATA		; Set A1_STEP
	movei	#A1_STEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#0,TEMPDATA				; Set A1_FSTEP
	movei	#A1_FSTEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

; TEMPDATA is still zero for the next three stores.
	movei	#A1_FPIXEL,TEMPADDR		; Set A1_FPIXEL
	store	TEMPDATA,(TEMPADDR)

	movei	#A1_INC,TEMPADDR		; Set A1_INC
	store	TEMPDATA,(TEMPADDR)

	movei	#A1_FINC,TEMPADDR		; Set A1_FINC
	store	TEMPDATA,(TEMPADDR)

	load	(r14+10),TEMPDATA		; Set A2_BASE
	movei	#A2_BASE,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+11),TEMPDATA		; Set A2_FLAGS
	movei	#A2_FLAGS,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+12),TEMPDATA		; Set A2_PIXEL
	movei	#A2_PIXEL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+13),TEMPDATA		; Set A2_STEP
	movei	#A2_STEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#OtherStuff,JUMPDEST	; Go program the common B_* registers
	jump	(JUMPDEST)
	nop


; ScaledBlit -- blit type 2.
; Every A1 and A2 register handled by this file is taken from the entry:
; longs 1-9 feed A1_BASE..A1_FINC and longs 10-13 feed A2_BASE..A2_STEP.
; Unlike the other types, A1_CLIP and the fractional/increment registers are
; not forced to zero.  Continues at OtherStuff.
ScaledBlit:
	load	(r14+1),TEMPDATA		; Set A1_BASE
	movei	#A1_BASE,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+2),TEMPDATA		; Set A1_FLAGS
	movei	#A1_FLAGS,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+3),TEMPDATA		; Set A1_CLIP
	movei	#A1_CLIP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+4),TEMPDATA		; Set A1_PIXEL
	movei	#A1_PIXEL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+5),TEMPDATA		; Set A1_STEP
	movei	#A1_STEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+6),TEMPDATA		; Set A1_FSTEP
	movei	#A1_FSTEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+7),TEMPDATA		; Set A1_FPIXEL
	movei	#A1_FPIXEL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)
  
  	load	(r14+8),TEMPDATA		; Set A1_INC
	movei	#A1_INC,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+9),TEMPDATA		; Set A1_FINC
	movei	#A1_FINC,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+10),TEMPDATA		; Set A2_BASE
	movei	#A2_BASE,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+11),TEMPDATA		; Set A2_FLAGS
	movei	#A2_FLAGS,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+12),TEMPDATA		; Set A2_PIXEL
	movei	#A2_PIXEL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+13),TEMPDATA		; Set A2_STEP
	movei	#A2_STEP,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#OtherStuff,JUMPDEST	; Go program the common B_* registers
	jump	(JUMPDEST)
	nop

; ExpandImage -- blit type 3: LZSS-decompress instead of blitting.
; Fills in the three pointer variables that delzss reads and jumps to it:
;   lzoutbuf  <- entry long 1  (the slot the blit types use for A1_BASE)
;   lzinbuf   <- entry long 10 (the slot the blit types use for A2_BASE)
;   lzworkbuf <- address of DecompBuf
; No Blitter registers are written.  delzss never returns here: it leaves
; through alldone, which jumps straight to BlitDone.  The former trailing
; "jump to OtherStuff" sat after the unconditional jump below, could never
; execute, and has been removed.
ExpandImage:
	load	(r14+1),TEMPDATA		; lzoutbuf = destination pointer
	movei	#lzoutbuf,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+10),TEMPDATA		; lzinbuf = compressed-data pointer
	movei	#lzinbuf,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#DecompBuf,TEMPDATA		; lzworkbuf = LZSS window buffer
	movei	#lzworkbuf,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	movei	#delzss,JUMPDEST		; Run the decoder (exits via BlitDone)
	jump	(JUMPDEST)
	nop

; StopGpu -- blit type 4: write zero to G_CTRL.
; USEINDEX is not advanced on this path.
; NOTE(review): no jump follows the two nops, so in program order this falls
; through into OtherStuff; presumably the GPU has halted by then -- confirm.
StopGpu:
  	moveq	#0,TEMPDATA
	movei	#G_CTRL,TEMPADDR
	store	TEMPDATA,(TEMPADDR)
	nop
	nop

; OtherStuff -- common tail for PatternBlit, BitmapBlit and ScaledBlit.
; Copies entry longs 14-19 into the B_* registers, writing B_CMD last, then
; falls through into BlitDone.
OtherStuff:
	load	(r14+14),TEMPDATA		; Set B_COUNT
	movei	#B_COUNT,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+15),TEMPDATA		; Set B_DSTZ
	movei	#B_DSTZ,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+16),TEMPDATA		; Set B_SRCZ1
	load	(r14+16),ORDATA			; Same long again, for the OR below
	shlq	#16,TEMPDATA			; TEMPDATA = (value << 16) | value
	or		ORDATA,TEMPDATA
	movei	#B_SRCZ1,TEMPADDR
	store	TEMPDATA,(TEMPADDR)
	movei	#B_SRCZ1+4,TEMPADDR		; Same value into the following long
	store	TEMPDATA,(TEMPADDR)

	load	(r14+17),TEMPDATA		; Set B_PATD
	movei	#B_PATD,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+18),TEMPDATA		; Set B_IINC
	movei	#B_IINC,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

	load	(r14+19),TEMPDATA		; Set B_CMD
	movei	#B_CMD,TEMPADDR
	store	TEMPDATA,(TEMPADDR)

; BlitDone -- retire the current entry.
; Advances USEINDEX modulo 256, writes it back to _BlitUseIndex and returns
; to the polling loop at _DoGpuBlit.  Also the exit target of the LZSS
; decoder (via alldone).
BlitDone:
	addq	#1,USEINDEX
	movei	#255,TEMPDATA			; Wrap to 0..255
	and		TEMPDATA,USEINDEX
	movei	#_BlitUseIndex,TEMPADDR
	store	USEINDEX,(TEMPADDR)		; Publish the new Use Index

	movei	#_DoGpuBlit,TEMPADDR	; Back to the top of the main loop
	jump	(TEMPADDR)
	nop
	nop

;  	moveq	#0,TEMPDATA
;	movei	#G_CTRL,TEMPADDR
;	store	TEMPDATA,(TEMPADDR)
;	nop
;	nop




;;; Register Equates
;;; Register assignments for the LZSS decoder (r0-r19).  r14 is reused here as
;;; rbigloop, so the blit entry base that the dispatcher keeps in r14 is lost
;;; once delzss has run.
lzinbufptr		.equr	r0		; read pointer into the compressed data
lzoutbufptr		.equr	r1		; write pointer into the output buffer
lzworkbufptr	.equr	r2		; base address of the window buffer

currentpos		.equr	r3		; current write offset within the window
ch				.equr	r4		; byte being emitted

addr			.equr	r5		; scratch address
temp			.equr	r6		; scratch value

matchlen		.equr	r7		; remaining match length
matchpos		.equr	r8		; read offset within the window
mask			.equr	r9		; current bit position within rack
rack			.equr	r10		; most recently loaded 32 input bits
bufmask			.equr	r11		; $1FFF, wraps window offsets
bigmask			.equr	r12		; bit being assembled by INPUT_BITS
startmask		.equr	r13		; $80000000, initial value of mask

rbigloop		.equr	r14		; address of bigloop
mreg			.equr	r15		; INPUT_BITS: address of its loop head
preg			.equr	r16		; INPUT_BITS: address of its exit label
inner			.equr	r17		; address of getchs
compressed		.equr	r18		; address of cstream
done			.equr	r19		; address of alldone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; INPUT_BITS destreg,num_bits
;;;
;;; Reads num_bits bits from the input stream, most significant bit first,
;;; and leaves them right-justified in destreg.
;;; Uses/updates: mask, rack, lzinbufptr (stream state), startmask (read only).
;;; Clobbers:     bigmask, temp, mreg, preg and the flags.
;;; The \~ suffix keeps each expansion's labels distinct.

MACRO	INPUT_BITS	destreg, num_bits
		moveq	#1,bigmask		; bigmask = 1 << (bit_count - 1)
		moveq	#0,\destreg

		shlq	#\num_bits-1,bigmask

		movei	#.m\~,mreg		; Loop head and exit addresses for jump (Rn)
		movei	#.p\~,preg
.m\~:
		cmpq	#0,bigmask		; If bigmask == 0 we're done
		jump	EQ,(preg)		; (1 wait)
		nop
		
		cmp	startmask,mask		; (1 wait)
		jr	NE,.n\~			; Bits remain in rack: skip the reload
		nop

		load	(lzinbufptr),rack	; Load the next 32 input bits
		addq	#4,lzinbufptr
.n\~:
		move	rack,temp		; if( rack & mask ) destreg |= bigmask
		and	mask,temp		; (1 wait)

		jr	EQ,.o\~			; (1 wait)
		nop

		or	bigmask,\destreg
.o\~:
		shrq	#1,bigmask		; bigmask >>= 1
		shrq	#1,mask			; mask >>= 1
		
		jump	NE,(mreg)		; (1 wait ) mask still non-zero: next bit
		nop

		move	startmask,mask		; mask ran out: rewind it, forcing a reload
		jump	T,(mreg)
		nop
.p\~:
ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This is our routine entry point (instructions are interleaved for speed)
;;;
;;; delzss -- LZSS decoder.
;;; Input:  the longs at lzinbuf, lzoutbuf and lzworkbuf (compressed data
;;;         pointer, output pointer, window buffer pointer).
;;; The input is consumed 32 bits at a time, most significant bit first.
;;; Each token starts with a 1-bit flag:
;;;   flag != 0 : an 8-bit literal follows.
;;;   flag == 0 : a 13-bit window index follows; index 0 ends the stream,
;;;               otherwise a 4-bit length follows and
;;;               length + BREAK_EVEN + 1 bytes are copied from the window.
;;; Every output byte is also written into the window at currentpos.
;;; Exit:   jumps to alldone; there is no return to the caller.
;;; Uses r0-r19 (see the register equates above).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
delzss:
		movei	#lzinbuf,addr		; Address of variables
		movei	#$80000000,mask	  	; Mask to shift
		movei	#cstream,compressed	

		load	(addr),lzinbufptr	
		addq	#4,addr			; Next long variable

		movei	#$1FFF,bufmask		; Mask for 8k buffer
		load	(addr),lzoutbufptr
		addq	#4,addr			; Next long buffer

		load	(addr),lzworkbufptr

		movei	#bigloop,rbigloop	; Put in register for quick access
		move	mask,startmask		; Copy for later compares
		movei	#getchs,inner
		moveq	#1,currentpos		; Current position in window
		movei	#alldone,done
bigloop:
		cmp	startmask,mask		; Is mask back at $80000000?
		jr	NE,noreread
		nop

		load	(lzinbufptr),rack	; Get the next 32 input bits
		addq	#4,lzinbufptr
noreread:
		move	rack,temp		; Make a copy
		and	mask,temp		; Isolate bit (1 wait)

		shrq	#1,mask			; mask >>= 1
		jr	NE,havemask		; (1 wait)
		nop

		move	startmask,mask		; If mask == 0, mask = $80000000
havemask:
		cmpq	#0,temp			; Do final test to set ZERO flag
		jump	EQ,(compressed)		; If 0, get compressed stream	
		nop

		INPUT_BITS	ch,8		; Flag was set: read an 8-bit literal

		storeb	ch,(lzoutbufptr)	; store byte
		addq	#1,lzoutbufptr		; go to next buffer pos

		and	bufmask,currentpos	; force range of 0-8191

		move	lzworkbufptr,addr  	; get address of window
		add	currentpos,addr		; add offset (1 wait)
		addq	#1,currentpos		; increment window pointer
		storeb	ch,(addr)		; update window

		jump	T,(rbigloop)		
		nop
cstream:
		INPUT_BITS	matchpos,13	; Get Index into Window

		cmpq	#0,matchpos		; END_OF_STREAM???
		jump	EQ,(done)		; (1 wait)
		nop
		
		INPUT_BITS	matchlen,4	; Get Length of Match
		addq	#BREAK_EVEN,matchlen
getchs:
		and	bufmask,matchpos	; Range check matchpos
		move	lzworkbufptr,addr  	; Get Window Address + Offset
		and	bufmask,currentpos	; Range check currentpos
		add	matchpos,addr		; ^ avoids 1 wait

		loadb	(addr),ch		; Load a byte from window
		or	ch,ch			; (1 wait on purpose)
		storeb	ch,(lzoutbufptr)	; Store it to our buffer
		addq	#1,lzoutbufptr
					   
		move	lzworkbufptr,temp  	; Store byte in window
		add	currentpos,temp		; @ currentpos (1 wait)
		addq	#1,currentpos		; Update Window Position
		storeb	ch,(temp)		; Update Buffer
		
		addq	#1,matchpos		; Increment window read addr
		subq	#1,matchlen		; Decrement loop counter

		jump	PL,(inner)		; ->getchs (1 wait) while matchlen >= 0
		nop

		jump	T,(rbigloop)		; Match copied: back for the next token
		nop

;;; Ok, we're done... now leave.
;;; Reached from delzss when a window index of 0 is read.  Jumps to BlitDone,
;;; which retires the current entry and resumes the main loop.  r0
;;; (lzinbufptr) is reused as the jump register since decoding is over.
alldone:
	movei	#BlitDone,r0
	jump	(r0)
	nop
	nop

;		moveq	#0,temp
;		movei	#G_CTRL,addr
;		store	temp,(addr)
;		nop
;		nop


; Variables that are part of the GPU image (they sit before _gpu_end, so
; _InitGpuCode copies them along with the code).  All start out as zero.
; delzss reads lzinbuf, lzoutbuf and lzworkbuf as three consecutive longs, so
; their order must not change.
	.long

lzinbuf:
 	.dc.l	0		; Pointer to Compressed Data
lzoutbuf:
	.dc.l	0		; Pointer of Destination Buffer
lzworkbuf:
	.dc.l	0		; Pointer to 8k LZSS Window

; Queue indices.  The GPU only reads _BlitAddIndex; it writes _BlitUseIndex
; in BlitDone after each entry.
_BlitAddIndex:
	.dc.l	0
_BlitUseIndex:
	.dc.l	0

	.LONG
	.68000
	.data

; _gpu_end marks the end of the image that _InitGpuCode copies into GPU RAM;
; _gpu_size is the image length in bytes.
; The image is loaded at G_RAM+$100 (see the .ORG after _gpu_start), so of
; the 4K ($1000) of GPU RAM only $1000-$100 = $F00 bytes are available.
; Fail the build if the image would run past the end of GPU RAM.
_gpu_end:
_gpu_size	.EQU	*-_gpu_start
	.GLOBL	_gpu_size
	.IF	_gpu_size>$F00
	.PRINT	"Code size (",/l/x _gpu_size,") is over $F00"
	.FAIL
	.ENDIF

; Uninitialised buffers that live outside the GPU image.
	.bss

; LZSS window buffer; ExpandImage stores its address in lzworkbuf.
; Reserved as 4096 longs (16384 bytes), while delzss wraps its offsets with
; $1FFF and so only ever touches the first 8192 bytes.
DecompBuf:
		.ds.l	4096

; Blit queue: 20480 bytes = 256 entries of 80 bytes (20 longs) each, matching
; the *80 in BlitStart and the wrap at 255 in BlitDone.
	.phrase
_Blit:
	.ds.b	20480

