From b1a559002c4fec3940fd7fb0a20f7d78ab5e4ec9 Mon Sep 17 00:00:00 2001 From: XProger Date: Thu, 1 Dec 2022 17:50:11 +0300 Subject: [PATCH] #368 GBA audio processing optimizations, enable VRAM meshes, cleanup rendering constants --- src/fixed/common.h | 11 ++-- src/platform/gba/asm/sndIMA.s | 46 +++++++------ src/platform/gba/asm/sndPCM.s | 106 +++++++++++++++++++----------- src/platform/gba/main.cpp | 4 +- src/platform/gba/rasterizer.h | 14 ++-- src/platform/gba/render.iwram.cpp | 10 +-- src/platform/gba/sound.cpp | 65 ++++++++++-------- 7 files changed, 149 insertions(+), 107 deletions(-) diff --git a/src/fixed/common.h b/src/fixed/common.h index cca4fa67..51898eb7 100644 --- a/src/fixed/common.h +++ b/src/fixed/common.h @@ -66,6 +66,7 @@ #define FRAME_HEIGHT 160 #define USE_FMT (LVL_FMT_PKD) + #define USE_VRAM_MESH // experimental #include #elif defined(__NDS__) @@ -189,8 +190,6 @@ #include #include -#define VRAM_WIDTH (FRAME_WIDTH/2) // in shorts - #ifndef USE_FMT #define USE_FMT (LVL_FMT_PHD | LVL_FMT_PSX | LVL_FMT_SAT | LVL_FMT_TR2 | LVL_FMT_TR4) #endif @@ -343,13 +342,13 @@ X_INLINE int32 abs(int32 x) { #endif #if defined(__GBA_WIN__) - extern uint16 fb[VRAM_WIDTH * FRAME_HEIGHT]; + extern uint16 fb[FRAME_WIDTH * FRAME_HEIGHT]; #elif defined(__GBA__) extern uint32 fb; #elif defined(__TNS__) - extern uint16 fb[VRAM_WIDTH * FRAME_HEIGHT]; + extern uint16 fb[FRAME_WIDTH * FRAME_HEIGHT]; #elif defined(__DOS__) - extern uint16 fb[VRAM_WIDTH * FRAME_HEIGHT]; + extern uint16 fb[FRAME_WIDTH * FRAME_HEIGHT]; #endif #define STATIC_MESH_FLAG_NO_COLLISION 1 @@ -2839,7 +2838,7 @@ int32 doTutorial(ItemObj* lara, int32 track); void sndInit(); void sndInitSamples(); void sndFreeSamples(); -void sndFill(int8* buffer, int32 count); +void sndFill(int8* buffer); void* sndPlaySample(int32 index, int32 volume, int32 pitch, int32 mode); void sndPlayTrack(int32 track); bool sndTrackIsPlaying(); diff --git a/src/platform/gba/asm/sndIMA.s b/src/platform/gba/asm/sndIMA.s index 
18ee918a..e7f43480 100644 --- a/src/platform/gba/asm/sndIMA.s +++ b/src/platform/gba/asm/sndIMA.s @@ -10,49 +10,53 @@ stepLUT .req r6 step .req r7 n .req r8 index .req r9 +mask .req r10 /* NOTE(review): r10 is callee-saved under AAPCS, but the prologue visible in this hunk still pushes only r4-r9 and the epilogue is outside the hunk - confirm r10 is saved and restored */ out .req r12 tmp .req out +diff .req step IMA_STEP_SIZE = 88 -.macro decode4 n, out +.macro ima_decode ldr step, [stepLUT, idx, lsl #2] - and index, \n, #7 - mov tmp, step, lsl #1 - mla step, index, tmp, step - tst \n, #8 - subne smp, smp, step, lsr #3 - addeq smp, smp, step, lsr #3 + mul tmp, step, index + add diff, tmp, lsl #1 + + subne smp, diff, lsr #3 + addeq smp, diff, lsr #3 subs index, #3 - suble idx, idx, #1 - bicle idx, idx, idx, asr #31 - addgt idx, idx, index, lsl #1 - cmpgt idx, #IMA_STEP_SIZE + suble idx, #1 + addgt idx, index, lsl #1 + + // clamp 0..88 + bic idx, idx, asr #31 + cmp idx, #IMA_STEP_SIZE movgt idx, #IMA_STEP_SIZE - mov \out, smp, asr #(2 + SND_VOL_SHIFT) + mov out, smp, asr #(2 + SND_VOL_SHIFT) + strb out, [buffer], #1 .endm -.global sndIMA_asm -sndIMA_asm: +.global sndIMA_fill_asm +sndIMA_fill_asm: stmfd sp!, {r4-r9} ldmia state, {smp, idx} - ldr stepLUT, =IMA_STEP + mov mask, #7 .loop: ldrb n, [data], #1 - decode4 n, out - strb out, [buffer], #1 - - mov n, n, lsr #4 + and index, mask, n + tst n, #8 + ima_decode - decode4 n, out - strb out, [buffer], #1 + and index, mask, n, lsr #4 + tst n, #(8 << 4) + ima_decode subs size, #1 bne .loop diff --git a/src/platform/gba/asm/sndPCM.s b/src/platform/gba/asm/sndPCM.s index 1c2d4353..dfc5492f 100644 --- a/src/platform/gba/asm/sndPCM.s +++ b/src/platform/gba/asm/sndPCM.s @@ -7,61 +7,87 @@ volume .req r3 data .req r4 buffer .req r5 -count .req r6 -ampA .req r7 -ampB .req r8 -outA .req r9 -outB .req r12 -last .req count -tmpSP .req outB -tmp .req ampA - -.macro clamp amp +tmp .req r6 +last .req r12 +tmpSP .req last +out .req size + +.macro clamp // Vanadium's clamp trick (-128..127) - mov tmp, \amp, asr #31 // tmp <- 0xffffffff - cmp tmp, \amp, asr #7 // not equal - eorne \amp, tmp, #0x7F // amp <- 0xffffff80 + 
mov tmp, out, asr #31 // tmp <- 0xffffffff + cmp tmp, out, asr #7 // not equal + eorne out, tmp, #0x7F // out <- 0xffffff80 +.endm + +.macro calc_last + // last = pos + inc * SND_SAMPLES (176) + add last, inc, inc, lsl #2 // last = inc * 5 + add last, inc, last, lsl #1 // last = inc * 11 + add last, pos, last, lsl #4 // last = pos + (inc * 11) * 16 +.endm + +.macro pcm_sample_fetch + ldrb out, [data, pos, lsr #SND_FIXED_SHIFT] + add pos, inc + sub out, #128 + mul out, volume +.endm + +.macro pcm_sample_fill + pcm_sample_fetch + asr out, #SND_VOL_SHIFT + strb out, [buffer], #1 +.endm + +.macro pcm_sample_mix + pcm_sample_fetch + ldrsb tmp, [buffer] + add out, tmp, out, asr #SND_VOL_SHIFT + clamp + strb out, [buffer], #1 .endm -.global sndPCM_asm -sndPCM_asm: +.global sndPCM_fill_asm +sndPCM_fill_asm: mov tmpSP, sp - stmfd sp!, {r4-r9} + stmfd sp!, {r4-r5} - ldmia tmpSP, {data, buffer, count} + ldmia tmpSP, {data, buffer} + + calc_last - mla last, inc, count, pos cmp last, size movgt last, size -.loop: - ldrb ampA, [data, pos, lsr #SND_FIXED_SHIFT] - add pos, pos, inc - ldrb ampB, [data, pos, lsr #SND_FIXED_SHIFT] - add pos, pos, inc +.loop_fill: + pcm_sample_fill + pcm_sample_fill - // can't use signed PCM because of LDRSB restrictions - sub ampA, ampA, #128 - sub ampB, ampB, #128 + cmp pos, last + blt .loop_fill - mul ampA, volume - mul ampB, volume + ldmfd sp!, {r4-r5} + bx lr - ldrsb outA, [buffer, #0] - ldrsb outB, [buffer, #1] - add outA, ampA, asr #SND_VOL_SHIFT - add outB, ampB, asr #SND_VOL_SHIFT +.global sndPCM_mix_asm +sndPCM_mix_asm: + mov tmpSP, sp + stmfd sp!, {r4-r6} // tmp reg required - clamp outA - clamp outB + ldmia tmpSP, {data, buffer} - strb outA, [buffer], #1 - strb outB, [buffer], #1 + calc_last + + cmp last, size + movgt last, size + +.loop_mix: + pcm_sample_mix + pcm_sample_mix cmp pos, last - blt .loop + blt .loop_mix -.done: - ldmfd sp!, {r4-r9} - bx lr + ldmfd sp!, {r4-r6} + bx lr \ No newline at end of file diff --git 
a/src/platform/gba/main.cpp b/src/platform/gba/main.cpp index 9c5102cf..301948c1 100644 --- a/src/platform/gba/main.cpp +++ b/src/platform/gba/main.cpp @@ -123,7 +123,7 @@ void soundFill() { WAVEHDR *waveHdr = waveBuf + curSoundBuffer; waveOutUnprepareHeader(waveOut, waveHdr, sizeof(WAVEHDR)); - sndFill((int8*)waveHdr->lpData, SND_SAMPLES); + sndFill((int8*)waveHdr->lpData); waveOutPrepareHeader(waveOut, waveHdr, sizeof(WAVEHDR)); waveOutWrite(waveOut, waveHdr, sizeof(WAVEHDR)); curSoundBuffer ^= 1; @@ -503,7 +503,7 @@ void soundFill() REG_DMA1CNT = DMA_DST_FIXED | DMA_REPEAT | DMA_16 | DMA_AT_FIFO | DMA_ENABLE; } - sndFill(soundBuffer + curSoundBuffer, SND_SAMPLES); + sndFill(soundBuffer + curSoundBuffer); curSoundBuffer ^= SND_SAMPLES; } diff --git a/src/platform/gba/rasterizer.h b/src/platform/gba/rasterizer.h index e8643e27..d9f1c33c 100644 --- a/src/platform/gba/rasterizer.h +++ b/src/platform/gba/rasterizer.h @@ -143,7 +143,7 @@ void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R) } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -253,7 +253,7 @@ void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R) } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -377,7 +377,7 @@ void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R) } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -533,7 +533,7 @@ void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R) } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -672,7 +672,7 @@ void rasterizeFTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R) } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -846,7 +846,7 @@ void rasterizeGTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R) } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -970,7 
+970,7 @@ extern "C" X_NOINLINE void rasterizeSprite_c(uint16* pixel, const VertexLink* L, if (L->v.y < 0) { - pixel -= L->v.y * VRAM_WIDTH; + pixel -= L->v.y * (FRAME_WIDTH >> 1); v -= L->v.y * dv; h += L->v.y; } diff --git a/src/platform/gba/render.iwram.cpp b/src/platform/gba/render.iwram.cpp index 9bbff31c..f8631bac 100644 --- a/src/platform/gba/render.iwram.cpp +++ b/src/platform/gba/render.iwram.cpp @@ -26,13 +26,13 @@ struct ViewportRel { ViewportRel viewportRel; #if defined(__GBA_WIN__) - uint16 fb[VRAM_WIDTH * FRAME_HEIGHT]; + uint16 fb[FRAME_WIDTH * FRAME_HEIGHT]; #elif defined(__GBA__) uint32 fb = MEM_VRAM; #elif defined(__TNS__) - uint16 fb[VRAM_WIDTH * FRAME_HEIGHT]; + uint16 fb[FRAME_WIDTH * FRAME_HEIGHT]; #elif defined(__DOS__) - uint16 fb[VRAM_WIDTH * FRAME_HEIGHT]; + uint16 fb[FRAME_WIDTH * FRAME_HEIGHT]; #endif enum FaceType { @@ -150,7 +150,7 @@ extern "C" { #define faceAddMeshTriangles faceAddMeshTriangles_c #define rasterize rasterize_c -X_INLINE bool checkBackface(const Vertex *a, const Vertex *b, const Vertex *c) +X_INLINE bool checkBackface(const Vertex* a, const Vertex* b, const Vertex* c) { return (b->x - a->x) * (c->y - a->y) <= (c->x - a->x) * (b->y - a->y); } @@ -803,7 +803,7 @@ void faceAddMesh(const MeshQuad* quads, const MeshTriangle* triangles, int32 qCo void clear() { - dmaFill((void*)fb, 0, VRAM_WIDTH * FRAME_HEIGHT * 2); + dmaFill((void*)fb, 0, FRAME_WIDTH * FRAME_HEIGHT); } void renderRoom(const Room* room) diff --git a/src/platform/gba/sound.cpp b/src/platform/gba/sound.cpp index 9d7c0b50..8dbb8eae 100644 --- a/src/platform/gba/sound.cpp +++ b/src/platform/gba/sound.cpp @@ -24,16 +24,19 @@ int32 IMA_STEP[] = { // IWRAM ! 
int8 soundBuffer[2 * SND_SAMPLES + 32]; // 32 bytes of silence for DMA overrun while interrupt #ifdef USE_ASM - #define sndIMA sndIMA_asm - #define sndPCM sndPCM_asm + #define sndIMA_fill sndIMA_fill_asm + #define sndPCM_fill sndPCM_fill_asm + #define sndPCM_mix sndPCM_mix_asm extern "C" { - void sndIMA_asm(IMA_STATE &state, int8* buffer, const uint8* data, int32 size); - int32 sndPCM_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer, int32 count); + void sndIMA_fill_asm(IMA_STATE &state, int8* buffer, const uint8* data, int32 size); + int32 sndPCM_fill_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer); + int32 sndPCM_mix_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer); } #else - #define sndIMA sndIMA_c - #define sndPCM sndPCM_c + #define sndIMA_fill sndIMA_c + #define sndPCM_fill sndPCM_c + #define sndPCM_mix sndPCM_c #define DECODE_IMA_4(n)\ step = IMA_STEP[idx];\ @@ -72,16 +75,16 @@ void sndIMA_c(IMA_STATE &state, int8* buffer, const uint8* data, int32 size) state.idx = idx; } -int32 sndPCM_c(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer, int32 count) +int32 sndPCM_c(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer) { - int32 last = pos + count * inc; + int32 last = pos + SND_SAMPLES * inc; if (last > size) { last = size; } while (pos < last) { - int32 amp = SND_DECODE(*buffer) + ((SND_DECODE(data[pos >> SND_FIXED_SHIFT]) * volume) >> SND_VOL_SHIFT); + int32 amp = SND_DECODE(*(uint8*)buffer) + ((SND_DECODE(data[pos >> SND_FIXED_SHIFT]) * volume) >> SND_VOL_SHIFT); *buffer++ = SND_ENCODE(X_CLAMP(amp, SND_MIN, SND_MAX)); pos += inc; } @@ -97,18 +100,18 @@ struct Music int32 pos; IMA_STATE state; - void fill(int8* buffer, int32 count) + void fill(int8* buffer) { - int32 len = X_MIN(size - pos, count >> 1); + int32 len = X_MIN(size - pos, SND_SAMPLES >> 1); - sndIMA(state, buffer, data + pos, len); 
+ sndIMA_fill(state, buffer, data + pos, len); pos += len; if (pos >= size) { data = NULL; - memset(buffer, 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0])); + memset(buffer + (len << 1), SND_ENCODE(0), (SND_SAMPLES - (len << 1)) * sizeof(buffer[0])); /* track ended: the decoder wrote 2*len samples at the start of buffer, so silence only the remaining tail */ } } }; @@ -121,9 +124,19 @@ struct Sample int32 volume; const uint8* data; - void fill(int8* buffer, int32 count) + void mix(int8* buffer) { - pos = sndPCM(pos, inc, size, volume, data, buffer, count); + pos = sndPCM_mix(pos, inc, size, volume, data, buffer); + + if (pos >= size) + { + data = NULL; + } + } + + void fill(int8* buffer) + { + pos = sndPCM_fill(pos, inc, size, volume, data, buffer); if (pos >= size) { @@ -262,23 +275,18 @@ void sndStop() music.data = NULL; } -void sndFill(int8* buffer, int32 count) +void sndFill(int8* buffer) { #ifdef PROFILE_SOUNDTIME PROFILE_CLEAR(); PROFILE(CNT_SOUND); #endif + bool mix = (music.data != NULL); - if ((channelsCount == 0) && !music.data) - { - dmaFill(buffer, SND_ENCODE(0), count); - return; - } - - if (music.data) { - music.fill(buffer, count); + if (mix) { + music.fill(buffer); } else { - dmaFill(buffer, 0, SND_SAMPLES * sizeof(buffer[0])); + dmaFill(buffer, SND_ENCODE(0), SND_SAMPLES * sizeof(buffer[0])); } int32 ch = channelsCount; @@ -286,10 +294,15 @@ void sndFill(int8* buffer, int32 count) { Sample* sample = channels + ch; - sample->fill(buffer, count); + if (mix) + sample->mix(buffer); + else + sample->fill(buffer); if (!sample->data) { channels[ch] = channels[--channelsCount]; } + + mix = true; } }