From 3ba5ec38183bec7194729b4e8d09d36f0d5801d9 Mon Sep 17 00:00:00 2001
From: XProger <xproger@list.ru>
Date: Sun, 4 Dec 2022 03:15:53 +0300
Subject: [PATCH] #368 GBA matrixRotateYXZ fetch sincos address lut once

---
 src/platform/gba/asm/faceAddMeshQuads.s     | 13 +++++------
 src/platform/gba/asm/faceAddMeshTriangles.s | 11 +++++-----
 src/platform/gba/asm/matrixRotate.s         | 24 +++++++++++++++------
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/platform/gba/asm/faceAddMeshQuads.s b/src/platform/gba/asm/faceAddMeshQuads.s
index ca035a1a..8d54b55c 100644
--- a/src/platform/gba/asm/faceAddMeshQuads.s
+++ b/src/platform/gba/asm/faceAddMeshQuads.s
@@ -49,9 +49,10 @@ faceAddMeshQuads_asm:
 
 .loop:
     ldrh vp0, [polys], #2
+    ldrh vp2, [polys], #4   // + flags
+
     lsr vp1, vp0, #8
     and vp0, #0xFF
-    ldrh vp2, [polys], #4   // + flags
     lsr vp3, vp2, #8
     and vp2, #0xFF
 
@@ -85,11 +86,11 @@ faceAddMeshQuads_asm:
     orrne flags, #FACE_CLIPPED
 
     // depth = AVG_Z4
-    lsl vg0, #16
-    add depth, vg0, vg1, lsl #16
-    add depth, vg2, lsl #16
-    add depth, vg3, lsl #16
-    lsr depth, #(16 + 2)
+    lsl vg0, #16                    // clip g part (high half)
+    add depth, vg0, vg1, lsl #16    // depth = vz0 + vz1
+    add depth, vg2, lsl #16         // depth += vz2
+    add depth, vg3, lsl #16         // depth += vz3
+    lsr depth, #(16 + 2)            // depth /= 4
 
     // faceAdd
     rsb vp0, vertices, vp0, lsr #3
diff --git a/src/platform/gba/asm/faceAddMeshTriangles.s b/src/platform/gba/asm/faceAddMeshTriangles.s
index 87c10bd4..38d49a54 100644
--- a/src/platform/gba/asm/faceAddMeshTriangles.s
+++ b/src/platform/gba/asm/faceAddMeshTriangles.s
@@ -47,9 +47,10 @@ faceAddMeshTriangles_asm:
 
 .loop:
     ldrh vp0, [polys], #2
+    ldrh vp2, [polys], #4   // + flags
+
     lsr vp1, vp0, #8
     and vp0, #0xFF
-    ldrh vp2, [polys], #4   // + flags
     and vp2, #0xFF
 
     add vp0, vp, vp0, lsl #3
@@ -77,10 +78,10 @@ faceAddMeshTriangles_asm:
     orrne flags, #FACE_CLIPPED
 
     // depth = AVG_Z3
-    lsl vg0, #16
-    add depth, vg0, vg1, lsl #16
-    add depth, vg2, lsl #17
-    lsr depth, #(16 + 2)
+    lsl vg0, #16                    // clip g part (high half)
+    add depth, vg0, vg1, lsl #16    // depth = vz0 + vz1
+    add depth, vg2, lsl #17         // depth += vz2 * 2
+    lsr depth, #(16 + 2)            // depth /= 4
 
     // faceAdd
     rsb vp0, vertices, vp0, lsr #3
diff --git a/src/platform/gba/asm/matrixRotate.s b/src/platform/gba/asm/matrixRotate.s
index e7776af3..114b4fc0 100644
--- a/src/platform/gba/asm/matrixRotate.s
+++ b/src/platform/gba/asm/matrixRotate.s
@@ -8,6 +8,13 @@
     mov \sin, \sin, asr #16
 .endm
 
+.macro sincosLUT lut, angle, sin, cos
+    ldr \sin, [\lut, \angle, lsl #2]    // load the 32-bit table entry at lut + angle * 4
+    mov \cos, \sin, lsl #16             // move the low halfword into the top 16 bits...
+    mov \cos, \cos, asr #16             // ...and shift back: cos = sign-extended low 16 bits
+    mov \sin, \sin, asr #16             // sin = sign-extended high 16 bits of the entry
+.endm
+
 .macro rotxy x, y, sin, cos, t
     mul \t, \y, \cos
     mla \t, \x, \sin, \t
@@ -126,6 +133,7 @@ e00     .req r3
 e01     .req r4
 e02     .req r5
 e10     .req r6
+scLUT   .req r7
 // FIQ regs
 e11     .req r8
 e12     .req r9
@@ -151,14 +159,16 @@ matrixRotateYXZ_asm:
     and angleY, mask, angleY, lsr #4
     and angleZ, mask, angleZ, lsr #4
 
-matrixRotateYXZ_fast_asm:
+matrixRotateYXZ_fast_asm:   // routine for pre-shifted angles
     orr mask, angleX, angleY
     orrs mask, mask, angleZ
     bxeq lr
 
-    stmfd sp!, {r4-r6}
+    stmfd sp!, {r4-r7}
     fiq_on
 
+    ldr scLUT, =gSinCosTable
+
     ldr mm, =gMatrixPtr
     ldr mm, [mm]
     ldmia mm, {e00, e01, e02}
@@ -171,7 +181,7 @@ matrixRotateYXZ_fast_asm:
     cmp angleY, #0
     beq .rotX
 
-    sincos angleY, sinY, cosY
+    sincosLUT scLUT, angleY, sinY, cosY
 
     rotxy e00, e02, sinY, cosY, tmp
     rotxy e10, e12, sinY, cosY, tmp
@@ -181,7 +191,7 @@ matrixRotateYXZ_fast_asm:
     cmp angleX, #0
     beq .rotZ
 
-    sincos angleX, sinX, cosX
+    sincosLUT scLUT, angleX, sinX, cosX
 
     rotxy e02, e01, sinX, cosX, tmp
     rotxy e12, e11, sinX, cosX, tmp
@@ -191,13 +201,13 @@ matrixRotateYXZ_fast_asm:
     cmp angleZ, #0
     beq .done
 
-    sincos angleZ, sinZ, cosZ
+    sincosLUT scLUT, angleZ, sinZ, cosZ
 
     rotxy e01, e00, sinZ, cosZ, tmp
     rotxy e11, e10, sinZ, cosZ, tmp
     rotxy e21, e20, sinZ, cosZ, tmp
 
-.done:  
+.done:
     ldr mm, =gMatrixPtr
     ldr mm, [mm]
 
@@ -208,7 +218,7 @@ matrixRotateYXZ_fast_asm:
     stmia mm, {e20, e21, e22}
 
     fiq_off
-    ldmfd sp!, {r4-r6}
+    ldmfd sp!, {r4-r7}
     bx lr
 
 q   .req r0     // arg