Skip to content

Commit

Permalink
#368 GBA rasterizer optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
XProger committed Dec 30, 2021
1 parent e7cb407 commit 9bcc846
Show file tree
Hide file tree
Showing 10 changed files with 154 additions and 152 deletions.
12 changes: 11 additions & 1 deletion src/platform/gba/common_asm.inc
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,14 @@ MAX_INT32 = 0x7FFFFFFF
sub vy2, vy2, vy0
mlas tmp, vx1, vy2, tmp
bgt \skip
.endm
.endm

// tex index, uv : fetch one texel byte through the TILE base register.
//   uv    - packed texture coordinate, read only: bits 15:8 supply the row (v)
//           and bits 31:24 supply the column (u).
//   index - receives the texel byte. It must be a different register from uv,
//           because it is overwritten before uv is read for the second time.
// TILE is not defined here; it has to be a register alias that is already in
// scope where the macro is expanded.
.macro tex index, uv
  and   \index, \uv, #0xFF00             // index = v * 256
  orr   \index, \index, \uv, lsr #24     // index = v * 256 + u
  ldrb  \index, [TILE, \index]           // index = byte at TILE + index
.endm

// lit index : replace index with the byte stored at LMAP + index.
//   index - on entry a byte offset from LMAP, on exit the loaded byte
//           (zero-extended by ldrb).
// LMAP is not defined here; it has to be a register alias that is already in
// scope where the macro is expanded.
// NOTE(review): presumably LMAP points at a lighting/shading lookup table and
// index is a texel value produced by the tex macro - confirm against callers.
.macro lit index
ldrb \index, [LMAP, \index]
.endm
44 changes: 20 additions & 24 deletions src/platform/gba/rasterizeFTA_mode4.s
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ N .req r6
Lh .req r7
Rh .req r8

Lx .req ip
Rx .req lr
Lt .req r9
Rt .req r10
h .req r11
Lx .req r9
Rx .req r10
Lt .req r11
Rt .req r12
h .req lr

Ldx .req h
Rdx .req h
Expand Down Expand Up @@ -49,20 +49,21 @@ Rduv .req h
Rdu .req N
Rdv .req h

// Aliases for the four per-scanline increments [Ldx, Ldt, Rdx, Rdt] while they
// are reloaded from the stack with a single ldmia at .scanline_end.
// They reuse tmp, N, Lh and Rh as temporaries.
// NOTE(review): LDM assigns memory words to registers in ascending register
// number, not in list order, so tmp < N < Lh < Rh must hold for every alias to
// receive its matching stack slot - confirm against the .req of tmp.
sLdx .req tmp
sLdt .req N
sRdx .req Lh
sRdt .req Rh

SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12

.macro PUT_PIXELS
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
tex indexA, t
add t, dtdx

and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
tex indexB, t
add t, dtdx

// cheap non-accurate alpha test, skip pixels pair if one or both are transparent
Expand All @@ -78,7 +79,7 @@ SP_RDT = 12

.global rasterizeFTA_mode4_asm
rasterizeFTA_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]

mov LMAP, #LMAP_ADDR
Expand Down Expand Up @@ -273,17 +274,12 @@ rasterizeFTA_mode4_asm:
bne .scanline_block_8px

.scanline_end:
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack

ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack

ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack

ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
add Rt, sRdt

add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240)

Expand All @@ -295,4 +291,4 @@ rasterizeFTA_mode4_asm:

.exit:
add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}
56 changes: 26 additions & 30 deletions src/platform/gba/rasterizeFT_mode4.s
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ N .req r6
Lh .req r7
Rh .req r8

Lx .req ip
Rx .req lr
Lt .req r9
Rt .req r10
h .req r11
Lx .req r9
Rx .req r10
Lt .req r11
Rt .req r12
h .req lr

Ldx .req h
Rdx .req h
Expand Down Expand Up @@ -49,39 +49,40 @@ Rduv .req h
Rdu .req N
Rdv .req h

// Aliases for the four per-scanline increments [Ldx, Ldt, Rdx, Rdt] while they
// are reloaded from the stack with a single ldmia at .scanline_end.
// They reuse tmp, N, Lh and Rh as temporaries.
// NOTE(review): LDM assigns memory words to registers in ascending register
// number, not in list order, so tmp < N < Lh < Rh must hold for every alias to
// receive its matching stack slot - confirm against the .req of tmp.
sLdx .req tmp
sLdt .req N
sRdx .req Lh
sRdt .req Rh

SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12

.macro PUT_PIXELS
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
ldrb indexA, [LMAP, indexA]
tex indexA, t
lit indexA

#ifndef TEX_2PX
add t, dtdx

and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
ldrb indexB, [LMAP, indexB]
tex indexB, t
lit indexB
add t, dtdx

orr indexA, indexB, lsl #8
strh indexA, [ptr], #2
#else
add t, dtdx, lsl #1

//orr indexA, indexA, lsl #8
strb indexA, [tmp], #2 // writing a byte to GBA VRAM will write a half word for free
#endif

strb indexA, [tmp], #2
.endm

.global rasterizeFT_mode4_asm
rasterizeFT_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]

mov LMAP, #LMAP_ADDR
Expand Down Expand Up @@ -273,26 +274,21 @@ rasterizeFT_mode4_asm:
bne .scanline_block_8px

.scanline_end:
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack

ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack

ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack

ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
add Rt, sRdt

add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240)
add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240)

subs h, #1
bne .scanline_start

ldmfd sp!, {L,R,Lh,Rh} // sp+16
ldmfd sp!, {L,R,Lh,Rh} // sp+16
b .loop

.exit:
add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}
21 changes: 11 additions & 10 deletions src/platform/gba/rasterizeF_mode4.s
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ R .req r2
index .req r3
Lh .req r4
Rh .req r5
Lx .req ip
Rx .req lr
Ldx .req r6
Rdx .req r7
N .req r8
tmp .req r9
DIVLUT .req r10
width .req r11
Lx .req r6
Rx .req r7
Ldx .req r8
Rdx .req r9
N .req r10
tmp .req r11
DIVLUT .req r12
width .req lr

h .req N
Ry1 .req tmp
Ry2 .req Rh
Expand All @@ -25,7 +26,7 @@ blocks .req DIVLUT

.global rasterizeF_mode4_asm
rasterizeF_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}

mov LMAP, #LMAP_ADDR

Expand Down Expand Up @@ -136,4 +137,4 @@ rasterizeF_mode4_asm:
b .loop

.exit:
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}
39 changes: 18 additions & 21 deletions src/platform/gba/rasterizeGTA_mode4.s
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ Rdv .req N
Rti .req tmp
Rgi .req tmp

// Aliases for the six per-scanline increments [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
// kept on the stack. The first five are reloaded together by the ldmia at
// .scanline_end; Rdt is fetched separately with a plain ldr.
// NOTE(review): LDM assigns memory words to registers in ascending register
// number, not in list order, so L < R < Lh < Rh < tmp must hold for every
// alias to receive its matching stack slot - confirm against the .req table.
sLdx .req L
sLdg .req R
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req N // not used in the ldm because of a collision with h

SP_LDX = 0
SP_LDG = 4
SP_LDT = 8
Expand All @@ -81,14 +88,10 @@ SP_RDT = 20
bic LMAP, g, #255
add g, dgdx

and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
tex indexA, t
add t, dtdx

and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
tex indexB, t
add t, dtdx

// cheap non-accurate alpha test, skip pixels pair if one or both are transparent
Expand All @@ -104,7 +107,7 @@ SP_RDT = 20

.global rasterizeGTA_mode4_asm
rasterizeGTA_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]

mov Lh, #0 // Lh = 0
Expand Down Expand Up @@ -339,20 +342,14 @@ rasterizeGTA_mode4_asm:
.scanline_end:
ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24

ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack

ldr tmp, [sp, #(SP_LDG + 16)]
add Lg, tmp // Lg += Ldg from stack

ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack

ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}

ldr tmp, [sp, #(SP_RDG + 16)]
add Rg, tmp // Rg += Rdg from stack
add Lx, sLdx
add Lg, sLdg
add Lt, sLdt
add Rx, sRdx
add Rg, sRdg

ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
Expand All @@ -367,4 +364,4 @@ rasterizeGTA_mode4_asm:

.exit:
add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}
Loading

0 comments on commit 9bcc846

Please sign in to comment.