Skip to content

Commit d91fd7d

Browse files
committed
Z sorting working
1 parent d3a4f4b commit d91fd7d

File tree

3 files changed

+47
-42
lines changed

3 files changed

+47
-42
lines changed

gbi.zsoex3.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@ typedef __attribute__((aligned(8))) struct {
7070
const void* displayListStart;
7171
void* rdpFifoStart;
7272
void* rdpFifoEnd;
73+
void* debugBuffer;
7374
} UcodeArgs;
7475

7576
/**

rsp/setup.inc

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,3 +69,9 @@ ACC_LOWER equ 2
6969
.endif
7070
.align alignment
7171
.endmacro
72+
73+
// assert_alignment: fail assembly with the message `errtext` when the current
// address (.) is not a multiple of `alignment`.
// The check masks with (alignment - 1), so it is only meaningful when
// `alignment` is a power of two.
.macro assert_alignment, alignment, errtext
74+
.if (. & (alignment - 1))
75+
.error errtext
76+
.endif
77+
.endmacro

zsoex3.s

Lines changed: 40 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,7 @@ unused1:
3131

3232
viewport: // v31Value only used at init, so we can clobber it after that.
3333
// constants for register $v31
34-
.if (. & 15) != 0
35-
.error "Wrong alignment for v31value"
36-
.endif
34+
assert_alignment 16, "Wrong alignment for v31value"
3735
v31Value:
3836
// v31 must go from lowest to highest (signed) values for vcc patterns.
3937
// Also relies on the fact that $v31[0h] is -4,-4,-4,-4, 4, 4, 4, 4.
@@ -47,9 +45,7 @@ v31Value:
4745
.dh 0x7FFF // used a couple times
4846

4947
// constants for register vTRC
50-
.if (. & 15) != 0
51-
.error "Wrong alignment for vTRCValue"
52-
.endif
48+
assert_alignment 16, "Wrong alignment for vTRCValue"
5349
vTRCValue:
5450
vTRCValue0 equ cacheStart // Currently 0x1D0; for converting vertex index to address
5551
vTRCValue1 equ cacheEnd // Currently 0xCD8; for vtx mtx address
@@ -178,19 +174,14 @@ alphaCompareCullThresh:
178174
perspNorm:
179175
.skip 2
180176

177+
.align 4 // TODO
178+
181179
// First half of RDP value for split commands. Also used as temp storage for
182180
// tri vertices during tri commands.
183181
rdpHalf1Val:
184182
.skip 4
185183

186-
.align 8 // TODO
187-
188-
texrectState: // Only needs to be saved over texrect, half1, half2
189-
.skip 8 // TODO
190-
191-
.if (. & 3) != 0
192-
.error "cpuInterface must be aligned to 4"
193-
.endif
184+
assert_alignment 4, "cpuInterface misaligned"
194185
cpuInterface:
195186
ucodeTextStart:
196187
.skip 4
@@ -201,19 +192,23 @@ rdpFifoStart:
201192
.skip 4
202193
rdpFifoEnd:
203194
.skip 4
195+
debugBuffer:
196+
.skip 4
197+
198+
.align 8 // TODO
204199

205200
segmentTable:
206201
.skip (4 * 16) // 16 DRAM pointers
207202

208-
.if (. & 7) != 0
209-
.error "lightColors must be aligned to 16"
210-
.endif
203+
assert_alignment 8, "texrectState misaligned"
204+
texrectState: // Only needs to be saved over texrect, half1, half2
205+
.skip 8 // TODO
206+
207+
assert_alignment 8, "lightColors misaligned"
211208
lightColors:
212209
.skip 16
213210

214-
.if (. & 7) != 0
215-
.error "cacheStart must be aligned to 8"
216-
.endif
211+
assert_alignment 8, "cacheStart misaligned"
217212
cacheStart:
218213
219214
INPUT_BUFFER_CMDS equ 21
@@ -612,8 +607,6 @@ G_ZSOSECTION_handler:
612607
srl subSecOfsShf, cmd_w0, 22 // Offsets are left shifted by this much
613608
jal while_wait_dma_busy
614609
andi subSecOfsShf, subSecOfsShf, 3
615-
j sort_done_regular // TODO
616-
nop
617610
// Convert reference vertices from index -> address
618611
lpv $v2, (0x20)(sectionBase)
619612
lpv $v3, (0x28)(sectionBase)
@@ -644,7 +637,7 @@ G_ZSOSECTION_handler:
644637
lhu $3, (VTX_SCR_Z)($3)
645638
addi subSec, subSec, 1
646639
bne subSec, subSecEnd, @@loop
647-
sw $3, (0x40 - 2)($1)
640+
sh $3, (0x40 - 2)($1)
648641
// Load offsets and Zs
649642
lqv $v20, (0x40)(rdpCmdBufEndP1)
650643
lpv $v21, (0x00)(sectionBase)
@@ -657,7 +650,9 @@ G_ZSOSECTION_handler:
657650
lpv $v27, (0x18)(sectionBase)
658651
// Optimal 8 element sorting network from
659652
// https://bertdobbelaere.github.io/sorting_networks.html#N8L19D6
660-
// Elements 0 and 1 from each of 4 vectors
653+
// Elements 0 and 1 from each of 4 vectors (a, b, c, d). 4 groups of these in parallel
654+
// E.g. a0 is $v20[0], a1 is $v20[1], b0 is $v22[0], etc.
655+
// Then another a0 is $v20[2], and another is $v20[4], and last is $v20[6]
661656
.macro sort_swap, ozh, oih, ozl, oil, z0, i0, z1, i1
662657
vge ozh, z0, z1
663658
vmrg oih, i0, i1
@@ -672,7 +667,7 @@ G_ZSOSECTION_handler:
672667
.endmacro
673668
sort_swap $v12, $v13, $v14, $v15, $v20, $v21, $v22, $v23 // swap(a0, b0), swap(a1, b1)
674669
lb $24, alphaCompareCullMode
675-
sort_swap $v15, $v17, $v18, $v19, $v24, $v25, $v26, $v27 // swap(c0, d0), swap(c1, d1)
670+
sort_swap $v16, $v17, $v18, $v19, $v24, $v25, $v26, $v27 // swap(c0, d0), swap(c1, d1)
676671
lb $10, alphaCompareCullThresh
677672
sort_swap $v20, $v21, $v24, $v25, $v12, $v13, $v16, $v17 // swap(a0, c0), swap(a1, c1)
678673
sra $11, $24, 31 // -1 if ABOVE, else 0
@@ -696,42 +691,46 @@ G_ZSOSECTION_handler:
696691
// Element 0 across these regs is in sorted order; same for elements 2, 4, 6
697692
// These regs + 1 = indices
698693
// $v12, $v14, $v10, $v22, $v16, $v20, $v18, $v26
699-
veq $v29, $v31, $v31[0h] // vcc = 10101010
694+
veq $v29, $v31, $v31[0q] // vcc = 10101010
700695
li $24, 0xFF
701-
vmrg $v12, $v12, $v13[0h] // Interleave Z, index, Z, index, Z, index, Z, index
696+
vmrg $v12, $v12, $v13[0q] // Interleave Z, index, Z, index, Z, index, Z, index
702697
addi $1, rdpCmdBufEndP1, 0x0
703-
vmrg $v14, $v14, $v15[0h]
698+
vmrg $v14, $v14, $v15[0q]
704699
addi $2, rdpCmdBufEndP1, 0x4
705-
vmrg $v10, $v10, $v11[0h]
700+
vmrg $v10, $v10, $v11[0q]
706701
addi $3, rdpCmdBufEndP1, 0x8
707-
vmrg $v22, $v22, $v23[0h]
702+
vmrg $v22, $v22, $v23[0q]
708703
addi $4, rdpCmdBufEndP1, 0xC
709-
vmrg $v16, $v16, $v17[0h]
704+
vmrg $v16, $v16, $v17[0q]
710705
sqv $v12, (0x00)(rdpCmdBufEndP1)
711-
vmrg $v20, $v20, $v21[0h]
706+
vmrg $v20, $v20, $v21[0q]
712707
sqv $v14, (0x10)(rdpCmdBufEndP1)
713-
vmrg $v18, $v18, $v19[0h]
708+
vmrg $v18, $v18, $v19[0q]
714709
sqv $v10, (0x20)(rdpCmdBufEndP1)
715-
vmrg $v26, $v26, $v27[0h]
716-
sqv $v22, (0x30)(rdpCmdBufEndP1)
717710
vmrg $v12, $v12, $v31[2] // 0; clear indices
711+
sqv $v22, (0x30)(rdpCmdBufEndP1)
712+
vmrg $v26, $v26, $v27[0q]
718713
sqv $v16, (0x40)(rdpCmdBufEndP1)
714+
vnop
719715
sqv $v20, (0x50)(rdpCmdBufEndP1)
716+
vnop
720717
sqv $v18, (0x60)(rdpCmdBufEndP1)
718+
vadd $v13, $v12, $v31[1] // -1; subtract 1 from Z values
721719
j merge_sort_entry
722720
sqv $v26, (0x70)(rdpCmdBufEndP1)
723-
721+
724722
merge_sort_loop:
725-
beqz $10, sort_done_z0 // Z=0, stop early
723+
vadd $v13, $v12, $v31[1] // -1; subtract 1 from Z values to turn vlt into vle
724+
blez $10, sort_done_z0 // Z<=0, stop early
726725
addi subSec, subSec, 1
727726
beq subSec, subSecEnd, sort_done_regular
728-
sb $11, (0)(subSec)
727+
sb $11, (-1)(subSec)
729728
merge_sort_entry:
730-
vge $v29, $v12, $v12[0] // Is the head of list 0 the highest?
729+
vlt $v29, $v13, $v12[0] // v12-1 < v12[0] :=: v12 <= v12[0] :=: v12[0] >= v12
731730
cfc2 $20, $vcc
732-
vge $v29, $v12, $v12[2] // Or the head of list 2?
731+
vlt $v29, $v13, $v12[2] // Or the head of list 2?
733732
cfc2 $10, $vcc
734-
vge $v29, $v12, $v12[4] // Or list 4?
733+
vlt $v29, $v13, $v12[4] // Or list 4?
735734
beq $20, $24, merge_sort_list_0
736735
cfc2 $11, $vcc
737736
beq $10, $24, merge_sort_list_2
@@ -1368,7 +1367,6 @@ old_return_routine:
13681367
// Has mfc0 in branch delay slot, causes a stall if first instr after ret is load
13691368

13701369
dma_read_write:
1371-
shared_dma_read_write:
13721370
mfc0 $11, SP_DMA_FULL // load the DMA_FULL value
13731371
@@while_dma_full:
13741372
bnez $11, @@while_dma_full // Loop until DMA_FULL is cleared

0 commit comments

Comments
 (0)