@@ -31,9 +31,7 @@ unused1:
3131
3232viewport: // v31Value only used at init, so we can clobber it after that.
3333// constants for register $ v31
34- .if (. & 15 ) != 0
35- .error "Wrong alignment for v31value"
36- .endif
34+ assert_alignment 16 , "Wrong alignment for v31value"
3735v31Value:
3836// v31 must go from lowest to highest (signed) values for vcc patterns.
3937// Also relies on the fact that $v31[0h] is -4, -4, -4, -4, 4, 4, 4, 4.
@@ -47,9 +45,7 @@ v31Value:
4745 . dh 0x7FFF // used a couple times
4846
4947// constants for register vTRC
50- .if (. & 15 ) != 0
51- .error "Wrong alignment for vTRCValue"
52- .endif
48+ assert_alignment 16 , "Wrong alignment for vTRCValue"
5349vTRCValue:
5450vTRCValue0 equ cacheStart // Currently 0x1D0 ; for converting vertex index to address
5551vTRCValue1 equ cacheEnd // Currently 0xCD8 ; for vtx mtx address
@@ -178,19 +174,14 @@ alphaCompareCullThresh:
178174perspNorm:
179175 .skip 2
180176
177+ . align 4 // TODO
178+
181179// First half of RDP value for split commands. Also used as temp storage for
182180// tri vertices during tri commands.
183181rdpHalf1Val:
184182 .skip 4
185183
186- . align 8 // TODO
187-
188- texrectState: // Only needs to be saved over texrect , half1 , half2
189- .skip 8 // TODO
190-
191- .if (. & 3 ) != 0
192- .error "cpuInterface must be aligned to 4"
193- .endif
184+ assert_alignment 4 , "cpuInterface misaligned"
194185cpuInterface:
195186ucodeTextStart:
196187 .skip 4
@@ -201,19 +192,23 @@ rdpFifoStart:
201192 .skip 4
202193rdpFifoEnd:
203194 .skip 4
195+ debugBuffer:
196+ .skip 4
197+
198+ . align 8 // TODO
204199
205200segmentTable:
206201 .skip ( 4 * 16 ) // 16 DRAM pointers
207202
208- .if (. & 7 ) != 0
209- .error "lightColors must be aligned to 16"
210- .endif
203+ assert_alignment 8 , "texrectState misaligned"
204+ texrectState: // Only needs to be saved over texrect , half1 , half2
205+ .skip 8 // TODO
206+
207+ assert_alignment 8 , "lightColors misaligned"
211208lightColors:
212209 .skip 16
213210
214- .if (. & 7 ) != 0
215- .error "cacheStart must be aligned to 8"
216- .endif
211+ assert_alignment 8 , "cacheStart misaligned"
217212cacheStart:
218213
219214INPUT_BUFFER_CMDS equ 21
@@ -612,8 +607,6 @@ G_ZSOSECTION_handler:
612607 srl subSecOfsShf , cmd_w0 , 22 // Offsets are left shifted by this much
613608 jal while_wait_dma_busy
614609 andi subSecOfsShf , subSecOfsShf , 3
615- j sort_done_regular // TODO
616- nop
617610 // Convert reference vertices from index -> address
618611 lpv $ v2 , ( 0x20 )(sectionBase)
619612 lpv $ v3 , ( 0x28 )(sectionBase)
@@ -644,7 +637,7 @@ G_ZSOSECTION_handler:
644637 lhu $ 3 , (VTX_SCR_Z)( $ 3 )
645638 addi subSec , subSec , 1
646639 bne subSec , subSecEnd , @@ loop
647- sw $ 3 , ( 0x40 - 2 )( $ 1 )
640+ sh $ 3 , ( 0x40 - 2 )( $ 1 )
648641 // Load offsets and Zs
649642 lqv $ v20 , ( 0x40 )(rdpCmdBufEndP1)
650643 lpv $ v21 , ( 0x00 )(sectionBase)
@@ -657,7 +650,9 @@ G_ZSOSECTION_handler:
657650 lpv $ v27 , ( 0x18 )(sectionBase)
658651 // Optimal 8 element sorting network from
659652 // https://bertdobbelaere.github.io/sorting_networks.html#N8L19D6
660- // Elements 0 and 1 from each of 4 vectors
653+ // Elements 0 and 1 from each of 4 vectors (a , b , c , d). 4 groups of these in parallel
654+ // E.g. a0 is $ v20 [ 0 ], a1 is $ v20 [ 1 ], b0 is $ v22 [ 0 ], etc.
655+ // Then another a0 is $ v20 [ 2 ], and another is $ v20 [ 4 ], and last is $ v20 [ 6 ]
661656.macro sort_swap , ozh , oih , ozl , oil , z0 , i0 , z1 , i1
662657 vge ozh , z0 , z1
663658 vmrg oih , i0 , i1
@@ -672,7 +667,7 @@ G_ZSOSECTION_handler:
672667.endmacro
673668 sort_swap $ v12 , $ v13 , $ v14 , $ v15 , $ v20 , $ v21 , $ v22 , $ v23 // swap(a0 , b0) , swap(a1 , b1)
674669 lb $ 24 , alphaCompareCullMode
675- sort_swap $ v15 , $ v17 , $ v18 , $ v19 , $ v24 , $ v25 , $ v26 , $ v27 // swap(c0 , d0) , swap(c1 , d1)
670+ sort_swap $ v16 , $ v17 , $ v18 , $ v19 , $ v24 , $ v25 , $ v26 , $ v27 // swap(c0 , d0) , swap(c1 , d1)
676671 lb $ 10 , alphaCompareCullThresh
677672 sort_swap $ v20 , $ v21 , $ v24 , $ v25 , $ v12 , $ v13 , $ v16 , $ v17 // swap(a0 , c0) , swap(a1 , c1)
678673 sra $ 11 , $ 24 , 31 // - 1 if ABOVE , else 0
@@ -696,42 +691,46 @@ G_ZSOSECTION_handler:
696691 // Element 0 of these regs are sorted in order , same for 2 , 4 , 6
697692 // These regs + 1 = indices
698693 // $ v12 , $ v14 , $ v10 , $ v22 , $ v16 , $ v20 , $ v18 , $ v26
699- veq $ v29 , $ v31 , $ v31 [ 0h ] // vcc = 10101010
694+ veq $ v29 , $ v31 , $ v31 [ 0q ] // vcc = 10101010
700695 li $ 24 , 0xFF
701- vmrg $ v12 , $ v12 , $ v13 [ 0h ] // Interleave Z , index , Z , index , Z , index , Z , index
696+ vmrg $ v12 , $ v12 , $ v13 [ 0q ] // Interleave Z , index , Z , index , Z , index , Z , index
702697 addi $ 1 , rdpCmdBufEndP1 , 0x0
703- vmrg $ v14 , $ v14 , $ v15 [ 0h ]
698+ vmrg $ v14 , $ v14 , $ v15 [ 0q ]
704699 addi $ 2 , rdpCmdBufEndP1 , 0x4
705- vmrg $ v10 , $ v10 , $ v11 [ 0h ]
700+ vmrg $ v10 , $ v10 , $ v11 [ 0q ]
706701 addi $ 3 , rdpCmdBufEndP1 , 0x8
707- vmrg $ v22 , $ v22 , $ v23 [ 0h ]
702+ vmrg $ v22 , $ v22 , $ v23 [ 0q ]
708703 addi $ 4 , rdpCmdBufEndP1 , 0xC
709- vmrg $ v16 , $ v16 , $ v17 [ 0h ]
704+ vmrg $ v16 , $ v16 , $ v17 [ 0q ]
710705 sqv $ v12 , ( 0x00 )(rdpCmdBufEndP1)
711- vmrg $ v20 , $ v20 , $ v21 [ 0h ]
706+ vmrg $ v20 , $ v20 , $ v21 [ 0q ]
712707 sqv $ v14 , ( 0x10 )(rdpCmdBufEndP1)
713- vmrg $ v18 , $ v18 , $ v19 [ 0h ]
708+ vmrg $ v18 , $ v18 , $ v19 [ 0q ]
714709 sqv $ v10 , ( 0x20 )(rdpCmdBufEndP1)
715- vmrg $ v26 , $ v26 , $ v27 [ 0h ]
716- sqv $ v22 , ( 0x30 )(rdpCmdBufEndP1)
717710 vmrg $ v12 , $ v12 , $ v31 [ 2 ] // 0 ; clear indices
711+ sqv $ v22 , ( 0x30 )(rdpCmdBufEndP1)
712+ vmrg $ v26 , $ v26 , $ v27 [ 0q ]
718713 sqv $ v16 , ( 0x40 )(rdpCmdBufEndP1)
714+ vnop
719715 sqv $ v20 , ( 0x50 )(rdpCmdBufEndP1)
716+ vnop
720717 sqv $ v18 , ( 0x60 )(rdpCmdBufEndP1)
718+ vadd $ v13 , $ v12 , $ v31 [ 1 ] // - 1 ; subtract 1 from Z values
721719 j merge_sort_entry
722720 sqv $ v26 , ( 0x70 )(rdpCmdBufEndP1)
723-
721+
724722merge_sort_loop:
725- beqz $ 10 , sort_done_z0 // Z= 0 , stop early
723+ vadd $ v13 , $ v12 , $ v31 [ 1 ] // - 1 ; subtract 1 from Z values to turn vlt into vle
724+ blez $ 10 , sort_done_z0 // Z<= 0 , stop early
726725 addi subSec , subSec , 1
727726 beq subSec , subSecEnd , sort_done_regular
728- sb $ 11 , ( 0 )(subSec)
727+ sb $ 11 , ( - 1 )(subSec)
729728merge_sort_entry:
730- vge $ v29 , $ v12 , $ v12 [ 0 ] // Is the head of list 0 the highest?
729+ vlt $ v29 , $ v13 , $ v12 [ 0 ] // v12 - 1 < v12 [ 0 ] :=: v12 <= v12 [ 0 ] :=: v12 [ 0 ] >= v12
731730 cfc2 $ 20 , $ vcc
732- vge $ v29 , $ v12 , $ v12 [ 2 ] // Or the head of list 2 ?
731+ vlt $ v29 , $ v13 , $ v12 [ 2 ] // Or the head of list 2 ?
733732 cfc2 $ 10 , $ vcc
734- vge $ v29 , $ v12 , $ v12 [ 4 ] // Or list 4 ?
733+ vlt $ v29 , $ v13 , $ v12 [ 4 ] // Or list 4 ?
735734 beq $ 20 , $ 24 , merge_sort_list_0
736735 cfc2 $ 11 , $ vcc
737736 beq $ 10 , $ 24 , merge_sort_list_2
@@ -1368,7 +1367,6 @@ old_return_routine:
13681367 // Has mfc0 in branch delay slot , causes a stall if first instr after ret is load
13691368
13701369dma_read_write:
1371- shared_dma_read_write:
13721370 mfc0 $ 11 , SP_DMA_FULL // load the DMA_FULL value
13731371@@while_dma_full:
13741372 bnez $ 11 , @@while_dma_full // Loop until DMA_FULL is cleared
0 commit comments