From 9bcc8468d0f5120b3d3c0108f60b821fc074431c Mon Sep 17 00:00:00 2001 From: XProger Date: Thu, 30 Dec 2021 09:56:40 +0300 Subject: [PATCH] #368 GBA rasterizer optimization --- src/platform/gba/common_asm.inc | 12 +++++- src/platform/gba/rasterizeFTA_mode4.s | 44 ++++++++++----------- src/platform/gba/rasterizeFT_mode4.s | 56 +++++++++++++-------------- src/platform/gba/rasterizeF_mode4.s | 21 +++++----- src/platform/gba/rasterizeGTA_mode4.s | 39 +++++++++---------- src/platform/gba/rasterizeGT_mode4.s | 46 +++++++++++----------- src/platform/gba/rasterizeG_mode4.s | 36 ++++++++--------- src/platform/gba/rasterizeS_mode4.s | 20 +++++----- src/platform/gba/transformMesh.s | 16 ++++---- src/platform/gba/transformRoom.s | 16 ++++---- 10 files changed, 154 insertions(+), 152 deletions(-) diff --git a/src/platform/gba/common_asm.inc b/src/platform/gba/common_asm.inc index d46fe618..03831d3a 100644 --- a/src/platform/gba/common_asm.inc +++ b/src/platform/gba/common_asm.inc @@ -71,4 +71,14 @@ MAX_INT32 = 0x7FFFFFFF sub vy2, vy2, vy0 mlas tmp, vx1, vy2, tmp bgt \skip -.endm \ No newline at end of file +.endm + +.macro tex index, uv + and \index, \uv, #0xFF00 + orr \index, \uv, lsr #24 // index = t.v * 256 + t.u + ldrb \index, [TILE, \index] +.endm + +.macro lit index + ldrb \index, [LMAP, \index] +.endm diff --git a/src/platform/gba/rasterizeFTA_mode4.s b/src/platform/gba/rasterizeFTA_mode4.s index 5c0fa054..010cfcf8 100644 --- a/src/platform/gba/rasterizeFTA_mode4.s +++ b/src/platform/gba/rasterizeFTA_mode4.s @@ -11,11 +11,11 @@ N .req r6 Lh .req r7 Rh .req r8 -Lx .req ip -Rx .req lr -Lt .req r9 -Rt .req r10 -h .req r11 +Lx .req r9 +Rx .req r10 +Lt .req r11 +Rt .req r12 +h .req lr Ldx .req h Rdx .req h @@ -49,20 +49,21 @@ Rduv .req h Rdu .req N Rdv .req h +sLdx .req tmp +sLdt .req N +sRdx .req Lh +sRdt .req Rh + SP_LDX = 0 SP_LDT = 4 SP_RDX = 8 SP_RDT = 12 .macro PUT_PIXELS - and indexA, t, #0xFF00 - orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u - ldrb indexA, [TILE, indexA] + tex indexA, t add t, dtdx - and indexB, t, #0xFF00 - orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u - ldrb indexB, [TILE, indexB] + tex indexB, t add t, dtdx // cheap non-accurate alpha test, skip pixels pair if one or both are transparent @@ -78,7 +79,7 @@ SP_RDT = 12 .global rasterizeFTA_mode4_asm rasterizeFTA_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt] mov LMAP, #LMAP_ADDR @@ -273,17 +274,12 @@ rasterizeFTA_mode4_asm: bne .scanline_block_8px .scanline_end: - ldr tmp, [sp, #(SP_LDX + 16)] - add Lx, tmp // Lx += Ldx from stack - - ldr tmp, [sp, #(SP_LDT + 16)] - add Lt, tmp // Lt += Ldt from stack - - ldr tmp, [sp, #(SP_RDX + 16)] - add Rx, tmp // Rx += Rdx from stack - - ldr tmp, [sp, #(SP_RDT + 16)] - add Rt, tmp // Rt += Rdt from stack + add tmp, sp, #16 + ldmia tmp, {sLdx, sLdt, sRdx, sRdt} + add Lx, sLdx + add Lt, sLdt + add Rx, sRdx + add Rt, sRdt add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240) @@ -295,4 +291,4 @@ rasterizeFTA_mode4_asm: .exit: add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt] - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/rasterizeFT_mode4.s b/src/platform/gba/rasterizeFT_mode4.s index 321eeb83..859715b1 100644 --- a/src/platform/gba/rasterizeFT_mode4.s +++ b/src/platform/gba/rasterizeFT_mode4.s @@ -11,11 +11,11 @@ N .req r6 Lh .req r7 Rh .req r8 -Lx .req ip -Rx .req lr -Lt .req r9 -Rt .req r10 -h .req r11 +Lx .req r9 +Rx .req r10 +Lt .req r11 +Rt .req r12 +h .req lr Ldx .req h Rdx .req h @@ -49,39 +49,40 @@ Rduv .req h Rdu .req N Rdv .req h +sLdx .req tmp +sLdt .req N +sRdx .req Lh +sRdt .req Rh + SP_LDX = 0 SP_LDT = 4 SP_RDX = 8 SP_RDT = 12 .macro PUT_PIXELS - and indexA, t, #0xFF00 - orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u - ldrb indexA, [TILE, indexA] - ldrb indexA, [LMAP, indexA] + tex indexA, t + lit indexA #ifndef TEX_2PX add t, dtdx - and indexB, t, #0xFF00 - orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u - ldrb indexB, [TILE, indexB] - ldrb indexB, [LMAP, indexB] + tex indexB, t + lit indexB add t, dtdx orr indexA, indexB, lsl #8 + strh indexA, [ptr], #2 #else add t, dtdx, lsl #1 //orr indexA, indexA, lsl #8 + strb indexA, [tmp], #2 // writing a byte to GBA VRAM will write a half word for free #endif - - strb indexA, [tmp], #2 .endm .global rasterizeFT_mode4_asm rasterizeFT_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt] mov LMAP, #LMAP_ADDR @@ -273,26 +274,21 @@ rasterizeFT_mode4_asm: bne .scanline_block_8px .scanline_end: - ldr tmp, [sp, #(SP_LDX + 16)] - add Lx, tmp // Lx += Ldx from stack - - ldr tmp, [sp, #(SP_LDT + 16)] - add Lt, tmp // Lt += Ldt from stack - - ldr tmp, [sp, #(SP_RDX + 16)] - add Rx, tmp // Rx += Rdx from stack - - ldr tmp, [sp, #(SP_RDT + 16)] - add Rt, tmp // Rt += Rdt from stack + add tmp, sp, #16 + ldmia tmp, {sLdx, sLdt, sRdx, sRdt} + add Lx, sLdx + add Lt, sLdt + add Rx, sRdx + add Rt, sRdt - add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240) + add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240) subs h, #1 bne .scanline_start - ldmfd sp!, {L,R,Lh,Rh} // sp+16 + ldmfd sp!, {L,R,Lh,Rh} // sp+16 b .loop .exit: add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt] - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/rasterizeF_mode4.s b/src/platform/gba/rasterizeF_mode4.s index cffd2c22..d7bf536a 100644 --- a/src/platform/gba/rasterizeF_mode4.s +++ b/src/platform/gba/rasterizeF_mode4.s @@ -6,14 +6,15 @@ R .req r2 index .req r3 Lh .req r4 Rh .req r5 -Lx .req ip -Rx .req lr -Ldx .req r6 -Rdx .req r7 -N .req r8 -tmp .req r9 -DIVLUT .req r10 -width .req r11 +Lx .req r6 +Rx .req r7 +Ldx .req r8 +Rdx .req r9 +N .req r10 +tmp .req r11 +DIVLUT .req r12 +width .req lr + h .req N Ry1 .req tmp Ry2 .req Rh @@ -25,7 +26,7 @@ blocks .req DIVLUT .global rasterizeF_mode4_asm rasterizeF_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} mov LMAP, #LMAP_ADDR @@ -136,4 +137,4 @@ rasterizeF_mode4_asm: b .loop .exit: - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/rasterizeGTA_mode4.s b/src/platform/gba/rasterizeGTA_mode4.s index 51343f98..0ef5ad69 100644 --- a/src/platform/gba/rasterizeGTA_mode4.s +++ b/src/platform/gba/rasterizeGTA_mode4.s @@ -70,6 +70,13 @@ Rdv .req N Rti .req tmp Rgi .req tmp +sLdx .req L +sLdg .req R +sLdt .req Lh +sRdx .req Rh +sRdg .req tmp +sRdt .req N // not used in ldm due h collision + SP_LDX = 0 SP_LDG = 4 SP_LDT = 8 @@ -81,14 +88,10 @@ SP_RDT = 20 bic LMAP, g, #255 add g, dgdx - and indexA, t, #0xFF00 - orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u - ldrb indexA, [TILE, indexA] + tex indexA, t add t, dtdx - and indexB, t, #0xFF00 - orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u - ldrb indexB, [TILE, indexB] + tex indexB, t add t, dtdx // cheap non-accurate alpha test, skip pixels pair if one or both are transparent @@ -104,7 +107,7 @@ SP_RDT = 20 .global rasterizeGTA_mode4_asm rasterizeGTA_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt] mov Lh, #0 // Lh = 0 @@ -339,20 +342,14 @@ rasterizeGTA_mode4_asm: .scanline_end: ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24 - ldr tmp, [sp, #(SP_LDX + 16)] - add Lx, tmp // Lx += Ldx from stack - - ldr tmp, [sp, #(SP_LDG + 16)] - add Lg, tmp // Lg += Ldg from stack - - ldr tmp, [sp, #(SP_LDT + 16)] - add Lt, tmp // Lt += Ldt from stack - - ldr tmp, [sp, #(SP_RDX + 16)] - add Rx, tmp // Rx += Rdx from stack + add tmp, sp, #16 + ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg} - ldr tmp, [sp, #(SP_RDG + 16)] - add Rg, tmp // Rg += Rdg from stack + add Lx, sLdx + add Lg, sLdg + add Lt, sLdt + add Rx, sRdx + add Rg, sRdg ldr tmp, [sp, #(SP_RDT + 16)] add Rt, tmp // Rt += Rdt from stack @@ -367,4 +364,4 @@ rasterizeGTA_mode4_asm: .exit: add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt] - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/rasterizeGT_mode4.s b/src/platform/gba/rasterizeGT_mode4.s index e5090a8b..902200b3 100644 --- a/src/platform/gba/rasterizeGT_mode4.s +++ b/src/platform/gba/rasterizeGT_mode4.s @@ -70,6 +70,13 @@ Rdv .req N Rti .req tmp Rgi .req tmp +sLdx .req L +sLdg .req R +sLdt .req Lh +sRdx .req Rh +sRdg .req tmp +sRdt .req N // not used in ldm due h collision + SP_LDX = 0 SP_LDG = 4 SP_LDT = 8 @@ -81,18 +88,15 @@ SP_RDT = 20 bic LMAP, g, #255 add g, dgdx - and indexA, t, #0xFF00 - orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u - ldrb indexA, [TILE, indexA] - ldrb indexA, [LMAP, indexA] + tex indexA, t + lit indexA #ifndef TEX_2PX add t, dtdx - and indexB, t, #0xFF00 - orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u - ldrb indexB, [TILE, indexB] - ldrb indexB, [LMAP, indexB] + tex indexB, t + lit indexB + add t, dtdx orr indexA, indexB, lsl #8 @@ -101,13 +105,13 @@ SP_RDT = 20 add t, dtdx, lsl #1 //orr indexA, indexA, lsl #8 - strb indexA, [ptr], #2 + strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free #endif .endm .global rasterizeGT_mode4_asm rasterizeGT_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt] mov Lh, #0 // Lh = 0 @@ -330,20 +334,14 @@ rasterizeGT_mode4_asm: .scanline_end: ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24 - ldr tmp, [sp, #(SP_LDX + 16)] - add Lx, tmp // Lx += Ldx from stack - - ldr tmp, [sp, #(SP_LDG + 16)] - add Lg, tmp // Lg += Ldg from stack - - ldr tmp, [sp, #(SP_LDT + 16)] - add Lt, tmp // Lt += Ldt from stack - - ldr tmp, [sp, #(SP_RDX + 16)] - add Rx, tmp // Rx += Rdx from stack + add tmp, sp, #16 + ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg} - ldr tmp, [sp, #(SP_RDG + 16)] - add Rg, tmp // Rg += Rdg from stack + add Lx, sLdx + add Lg, sLdg + add Lt, sLdt + add Rx, sRdx + add Rg, sRdg ldr tmp, [sp, #(SP_RDT + 16)] add Rt, tmp // Rt += Rdt from stack @@ -358,4 +356,4 @@ rasterizeGT_mode4_asm: .exit: add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt] - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/rasterizeG_mode4.s b/src/platform/gba/rasterizeG_mode4.s index f858214d..dec63f99 100644 --- a/src/platform/gba/rasterizeG_mode4.s +++ b/src/platform/gba/rasterizeG_mode4.s @@ -9,11 +9,11 @@ tmp .req r5 N .req r6 Lh .req r7 Rh .req r8 -Lx .req ip -Rx .req lr -Lg .req r9 -Rg .req r10 -h .req r11 +Lx .req r9 +Rx .req r10 +Lg .req r11 +Rg .req r12 +h .req lr Ldx .req h Rdx .req Ldx Ldg .req Ldx @@ -29,6 +29,11 @@ width .req Rh g .req L dgdx .req R +sLdx .req L +sLdg .req R +sRdx .req Lh +sRdg .req Rh + SP_LDX = 0 SP_LDG = 4 SP_RDX = 8 @@ -44,7 +49,7 @@ SP_RDG = 12 .global rasterizeG_mode4_asm rasterizeG_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} sub sp, #16 // reserve stack space for [Ldx, Ldg, Rdx, Rdg] mov tmp, #LMAP_ADDR @@ -188,17 +193,12 @@ rasterizeG_mode4_asm: bne .scanline_block_4px .scanline_end: - ldr tmp, [sp, #(SP_LDX + 16)] - add Lx, tmp // Lx += Ldx from stack - - ldr tmp, [sp, #(SP_LDG + 16)] - add Lg, tmp // Lg += Ldg from stack - - ldr tmp, [sp, #(SP_RDX + 16)] - add Rx, tmp // Rx += Rdx from stack - - ldr tmp, [sp, #(SP_RDG + 16)] - add Rg, tmp // Rg += Rdg from stack + add tmp, sp, #16 + ldmia tmp, {sLdx, sLdg, sRdx, sRdg} + add Lx, sLdx + add Lg, sLdg + add Rx, sRdx + add Rg, sRdg add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240) @@ -210,4 +210,4 @@ rasterizeG_mode4_asm: .exit: add sp, #16 // revert reserved space for [Ldx, Ldg, Rdx, Rdg] - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/rasterizeS_mode4.s b/src/platform/gba/rasterizeS_mode4.s index a1b94f4f..380d660f 100644 --- a/src/platform/gba/rasterizeS_mode4.s +++ b/src/platform/gba/rasterizeS_mode4.s @@ -6,14 +6,14 @@ R .req r2 LMAP .req r3 Lh .req r4 Rh .req r5 -Lx .req ip -Rx .req lr -Ldx .req r6 -Rdx .req r7 -N .req r8 -tmp .req r9 -DIVLUT .req r10 -width .req r11 +Lx .req r6 +Rx .req r7 +Ldx .req r8 +Rdx .req r9 +N .req r10 +tmp .req r11 +DIVLUT .req r12 +width .req lr h .req N Ry1 .req tmp Ry2 .req Rh @@ -28,7 +28,7 @@ indexB .req DIVLUT .global rasterizeS_mode4_asm rasterizeS_mode4_asm: - stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} + stmfd sp!, {r4-r11, lr} ldr LMAP, .shadow_lightmap @@ -149,4 +149,4 @@ rasterizeS_mode4_asm: b .loop .exit: - ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc} \ No newline at end of file + ldmfd sp!, {r4-r11, pc} \ No newline at end of file diff --git a/src/platform/gba/transformMesh.s b/src/platform/gba/transformMesh.s index cb02f2bc..ca555509 100644 --- a/src/platform/gba/transformMesh.s +++ b/src/platform/gba/transformMesh.s @@ -71,7 +71,7 @@ transformMesh_asm: mla y, mx, vx, y mla y, my, vy, y mla y, mz, vz, y - mov y, y, asr #(FIXED_SHIFT - PROJ_SHIFT) + mov y, y, asr #FIXED_SHIFT // transform z ldmia m!, {mx, my, mz, z} @@ -99,21 +99,23 @@ transformMesh_asm: mul x, dz, x mul y, dz, y mov x, x, asr #(16 - PROJ_SHIFT) - // keep y shifted by 16 for min/max cmp + mov y, y, asr #(16 - PROJ_SHIFT) // viewport clipping ldmia sp, {minXY, maxXY} cmp x, minXY, asr #16 orrle vg, vg, #CLIP_LEFT - cmp y, minXY, lsl #16 - orrle vg, vg, #CLIP_TOP cmp x, maxXY, asr #16 orrge vg, vg, #CLIP_RIGHT - cmp y, maxXY, lsl #16 - orrge vg, vg, #CLIP_BOTTOM - mov y, y, asr #16 + mov minXY, minXY, lsl #16 + mov maxXY, maxXY, lsl #16 + + cmp y, minXY, asr #16 + orrle vg, vg, #CLIP_TOP + cmp y, maxXY, asr #16 + orrge vg, vg, #CLIP_BOTTOM add x, x, #(FRAME_WIDTH >> 1) add y, y, #(FRAME_HEIGHT >> 1) diff --git a/src/platform/gba/transformRoom.s b/src/platform/gba/transformRoom.s index f318d4be..1c350918 100644 --- a/src/platform/gba/transformRoom.s +++ b/src/platform/gba/transformRoom.s @@ -85,7 +85,7 @@ transformRoom_asm: mla y, mx, vx, y mla y, my, vy, y mla y, mz, vz, y - mov y, y, asr #(FIXED_SHIFT - PROJ_SHIFT) + mov y, y, asr #FIXED_SHIFT // transform x ldmdb m!, {mx, my, mz, x} @@ -121,21 +121,23 @@ transformRoom_asm: mul x, dz, x mul y, dz, y mov x, x, asr #(16 - PROJ_SHIFT) - // keep y shifted by 16 for min/max cmp + mov y, y, asr #(16 - PROJ_SHIFT) // viewport clipping ldmia sp, {m, minXY, maxXY} // preload matrix cmp x, minXY, asr #16 orrle vg, vg, #CLIP_LEFT - cmp y, minXY, lsl #16 - orrle vg, vg, #CLIP_TOP cmp x, maxXY, asr #16 orrge vg, vg, #CLIP_RIGHT - cmp y, maxXY, lsl #16 - orrge vg, vg, #CLIP_BOTTOM - mov y, y, asr #16 + mov minXY, minXY, lsl #16 + mov maxXY, maxXY, lsl #16 + + cmp y, minXY, asr #16 + orrle vg, vg, #CLIP_TOP + cmp y, maxXY, asr #16 + orrge vg, vg, #CLIP_BOTTOM add x, x, #(FRAME_WIDTH >> 1) add y, y, #(FRAME_HEIGHT >> 1)