From 0e6f2db4edb251e09371aeea0d569700448f29f2 Mon Sep 17 00:00:00 2001 From: wgroeneveld Date: Fri, 10 Jul 2020 20:14:47 +0200 Subject: [PATCH] move to toncs faster line rendering algorithms --- engine/CMakeLists.txt | 17 +- .../libgba-sprite-engine/gba/tonc_core.h | 55 +---- .../libgba-sprite-engine/gba/tonc_video.h | 49 ++++ .../libgba-sprite-engine/gba/toolbox.h | 18 ++ .../include/libgba-sprite-engine/gba_engine.h | 9 +- engine/src/gba/tonc_bmp8.cpp | 206 +++++++++++++++++ engine/src/gba/tonc_memset.s | 216 ++++++++++++++++++ engine/src/gba_engine.cpp | 34 +-- 8 files changed, 510 insertions(+), 94 deletions(-) create mode 100644 engine/include/libgba-sprite-engine/gba/tonc_video.h create mode 100644 engine/include/libgba-sprite-engine/gba/toolbox.h create mode 100644 engine/src/gba/tonc_bmp8.cpp create mode 100644 engine/src/gba/tonc_memset.s diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index 0181b7d..81a1b31 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -1,16 +1,23 @@ project(gba-bitmap-engine) -set_property(SOURCE src/gba/sin_lut.s PROPERTY LANGUAGE C) -set_property(SOURCE src/gba/tonc_bios.s PROPERTY LANGUAGE C) -set_property(SOURCE src/background/tonc_font.s PROPERTY LANGUAGE C) -set_source_files_properties(src/gba/tonc_bios.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") -set_source_files_properties(src/background/tonc_font.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") +set_property(SOURCE + src/gba/sin_lut.s + src/gba/tonc_bios.s + src/gba/tonc_memset.s + src/background/tonc_font.s PROPERTY LANGUAGE C) +set_source_files_properties( + src/gba/tonc_bios.s + src/gba/tonc_memset.s + src/background/tonc_font.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") +set_source_files_properties(src/gba/tonc_bmp8.cpp PROPERTIES COMPILE_FLAGS "-Wno-pointer-arith") add_library(${PROJECT_NAME} src/palette/palette_manager.cpp src/palette/combined_palette.cpp src/background/text_stream.cpp src/gba/sin_lut.s + src/gba/tonc_memset.s src/gba/tonc_bios.s + src/gba/tonc_bmp8.cpp src/background/tonc_font.s src/gba_engine.cpp src/math.cpp diff --git a/engine/include/libgba-sprite-engine/gba/tonc_core.h b/engine/include/libgba-sprite-engine/gba/tonc_core.h index c3ce2f3..a98babb 100644 --- a/engine/include/libgba-sprite-engine/gba/tonc_core.h +++ b/engine/include/libgba-sprite-engine/gba/tonc_core.h @@ -148,58 +148,11 @@ INLINE void *toncset32(void *dst, u32 src, uint count); // Fast memcpy/set -void memset16(void *dst, u16 hw, uint hwcount); -void memcpy16(void *dst, const void* src, uint hwcount); +extern "C" void memset16(void *dst, u16 hw, uint hwcount); +extern "C" void memcpy16(void *dst, const void* src, uint hwcount); -IWRAM_CODE void memset32(void *dst, u32 wd, uint wcount); -IWRAM_CODE void memcpy32(void *dst, const void* src, uint wcount); - - -//! Fastfill for halfwords, analogous to memset() -/*! Uses memset32() if \a hwcount>5 -* \param dst Destination address. -* \param hw Source halfword (not address). -* \param hwcount Number of halfwords to fill. -* \note \a dst must be halfword aligned. -* \note \a r0 returns as \a dst + \a hwcount*2. -*/ -void memset16(void *dst, u16 hw, uint hwcount); - -//! \brief Copy for halfwords. -/*! Uses memcpy32() if \a hwn>6 and - \a src and \a dst are aligned equally. - \param dst Destination address. - \param src Source address. - \param hwcount Number of halfwords to fill. - \note \a dst and \a src must be halfword aligned. - \note \a r0 and \a r1 return as - \a dst + \a hwcount*2 and \a src + \a hwcount*2. -*/ -void memcpy16(void *dst, const void* src, uint hwcount); - - -//! Fast-fill by words, analogous to memset() -/*! Like CpuFastSet(), only without the requirement of - 32byte chunks and no awkward store-value-in-memory-first issue. - \param dst Destination address. - \param wd Fill word (not address). - \param wdcount Number of words to fill. - \note \a dst must be word aligned. - \note \a r0 returns as \a dst + \a wdcount*4. -*/ -IWRAM_CODE void memset32(void *dst, u32 wd, uint wdcount); - - -//! \brief Fast-copy by words. -/*! Like CpuFastFill(), only without the requirement of 32byte chunks - \param dst Destination address. - \param src Source address. - \param wdcount Number of words. - \note \a src and \a dst must be word aligned. - \note \a r0 and \a r1 return as - \a dst + \a wdcount*4 and \a src + \a wdcount*4. -*/ -IWRAM_CODE void memcpy32(void *dst, const void* src, uint wdcount); +extern "C" IWRAM_CODE void memset32(void *dst, u32 wd, uint wcount); +extern "C" IWRAM_CODE void memcpy32(void *dst, const void* src, uint wcount); //\} diff --git a/engine/include/libgba-sprite-engine/gba/tonc_video.h b/engine/include/libgba-sprite-engine/gba/tonc_video.h new file mode 100644 index 0000000..3c57f7a --- /dev/null +++ b/engine/include/libgba-sprite-engine/gba/tonc_video.h @@ -0,0 +1,49 @@ +// +// Created by Wouter Groeneveld on 10/07/20. +// Excerpts from tonc_video - only taken which was needed (M4) +// + +#ifndef GBA_BITMAP_ENGINE_PROJECT_TONC_VIDEO_H +#define GBA_BITMAP_ENGINE_PROJECT_TONC_VIDEO_H + +#include +#include + +void bmp8_plot(int x, int y, u32 clr, void *dstBase, uint dstP); + +void bmp8_hline(int x1, int y, int x2, u32 clr, void *dstBase, uint dstP); +void bmp8_vline(int x, int y1, int y2, u32 clr, void *dstBase, uint dstP); +void bmp8_line(int x1, int y1, int x2, int y2, u32 clr, + void *dstBase, uint dstP); + +INLINE void m4_hline(int x1, int y, int x2, u8 clrid); +INLINE void m4_vline(int x, int y1, int y2, u8 clrid); +INLINE void m4_line(int x1, int y1, int x2, int y2, u8 clrid); +INLINE void m4_plot(int x, int y, u8 clrid); + +//! Plot a \a clrid pixel on the current mode 4 backbuffer +INLINE void m4_plot(int x, int y, u8 clrid) +{ + u16 *dst= &vid_page[(y*M4_WIDTH+x)>>1]; + if(x&1) + *dst= (*dst& 0xFF) | (clrid<<8); + else + *dst= (*dst&~0xFF) | clrid; +} + + +//! Draw a \a clrid colored horizontal line in mode 4. +INLINE void m4_hline(int x1, int y, int x2, u8 clrid) +{ bmp8_hline(x1, y, x2, clrid, vid_page, M4_WIDTH); } + + +//! Draw a \a clrid colored vertical line in mode 4. +INLINE void m4_vline(int x, int y1, int y2, u8 clrid) +{ bmp8_vline(x, y1, y2, clrid, vid_page, M4_WIDTH); } + + +//! Draw a \a clrid colored line in mode 4. +INLINE void m4_line(int x1, int y1, int x2, int y2, u8 clrid) +{ bmp8_line(x1, y1, x2, y2, clrid, vid_page, M4_WIDTH); } + +#endif //GBA_BITMAP_ENGINE_PROJECT_TONC_VIDEO_H diff --git a/engine/include/libgba-sprite-engine/gba/toolbox.h b/engine/include/libgba-sprite-engine/gba/toolbox.h new file mode 100644 index 0000000..e89b2ce --- /dev/null +++ b/engine/include/libgba-sprite-engine/gba/toolbox.h @@ -0,0 +1,18 @@ +// +// Created by Wouter Groeneveld on 10/07/20. +// + +#ifndef GBA_BITMAP_ENGINE_PROJECT_TOOLBOX_H +#define GBA_BITMAP_ENGINE_PROJECT_TOOLBOX_H + +#define GBA_SCREEN_WIDTH 240 +#define GBA_SCREEN_WIDTH_FX GBA_SCREEN_WIDTH << 8 +#define GBA_SCREEN_HEIGHT 160 +#define GBA_SCREEN_HEIGHT_FX GBA_SCREEN_HEIGHT << 8 + +#define M4_WIDTH GBA_SCREEN_WIDTH +#define M4_HEIGHT GBA_SCREEN_HEIGHT + +extern COLOR *vid_page; + +#endif //GBA_BITMAP_ENGINE_PROJECT_TOOLBOX_H diff --git a/engine/include/libgba-sprite-engine/gba_engine.h b/engine/include/libgba-sprite-engine/gba_engine.h index 7858899..468d613 100644 --- a/engine/include/libgba-sprite-engine/gba_engine.h +++ b/engine/include/libgba-sprite-engine/gba_engine.h @@ -15,16 +15,8 @@ #include "sound_control.h" #include "timer.h" -#define GBA_SCREEN_WIDTH 240 -#define GBA_SCREEN_WIDTH_FX GBA_SCREEN_WIDTH << 8 -#define GBA_SCREEN_HEIGHT 160 -#define GBA_SCREEN_HEIGHT_FX GBA_SCREEN_HEIGHT << 8 - - -#define M4_WIDTH 240 const unsigned int black[VRAM_PAGE_SIZE] = {}; -extern u16 *vid_page; class GBAEngine { private: @@ -74,6 +66,7 @@ public: for(int i = 0; i < times; i++){} } + inline void plotPixel(int x, int y, u8 clrId); inline void plotPixel(const VectorPx &pixel, u8 clrId); inline void plotLine(const VectorPx &point0, const VectorPx &point1, u8 clrId); }; diff --git a/engine/src/gba/tonc_bmp8.cpp b/engine/src/gba/tonc_bmp8.cpp new file mode 100644 index 0000000..4daaa07 --- /dev/null +++ b/engine/src/gba/tonc_bmp8.cpp @@ -0,0 +1,206 @@ +// +// Created by Wouter Groeneveld on 10/07/20. +// + +//! Plot a single pixel on a 8-bit buffer +#include +#include +#include + +/*! + \param x X-coord. + \param y Y-coord. + \param clr Color. + \param dstBase Canvas pointer (halfword-aligned plz). + \param dstP Canvas pitch in bytes. + \note Slow as fuck. Inline plotting functionality if possible. +*/ +void bmp8_plot(int x, int y, u32 clr, void *dstBase, uint dstP) +{ + u16 *dstD= (u16*)(dstBase+y*dstP+(x&~1)); + + if(x&1) + *dstD= (*dstD& 0xFF) | (clr<<8); + else + *dstD= (*dstD&~0xFF) | (clr&0xFF); +} + + +//! Draw a horizontal line on an 8bit buffer +/*! + \param x1 First X-coord. + \param y Y-coord. + \param x2 Second X-coord. + \param clr Color index. + \param dstBase Canvas pointer (halfword-aligned plz). + \param dstP canvas pitch in bytes. + \note Does normalization, but not bounds checks. +*/ +void bmp8_hline(int x1, int y, int x2, u32 clr, void *dstBase, uint dstP) +{ + // --- Normalize --- + clr &= 0xFF; + if(x2x2) + { xstep= -1; dx= x1-x2; } + else + { xstep= +1; dx= x2-x1; } + + if(y1>y2) + { ystep= -dstP; dy= y1-y2; } + else + { ystep= +dstP; dy= y2-y1; } + + + // --- Drawing --- + // NOTE: because xstep is alternating, you can do marvels + // with mask-flips + // NOTE: (mask>>31) is equivalent to (x&1) ? 0 : 1 + + if(dx>=dy) // Diagonal, slope <= 1 + { + dd= 2*dy - dx; + + for(ii=dx; ii>=0; ii--) + { + dstL= (u16*)(addr - (mask>>31)); + *dstL= (*dstL &~ mask) | (clr & mask); + + if(dd >= 0) + { dd -= 2*dx; addr += ystep; } + + dd += 2*dy; + addr += xstep; + mask = ~mask; + } + } + else // # Diagonal, slope > 1 + { + dd= 2*dx - dy; + + for(ii=dy; ii>=0; ii--) + { + dstL= (u16*)(addr - (mask>>31)); + *dstL= (*dstL &~ mask) | (clr & mask); + + if(dd >= 0) + { + dd -= 2*dy; + addr += xstep; + mask = ~mask; + } + + dd += 2*dx; + addr += ystep; + } + } +} + +INLINE void m4_line(int x1, int y1, int x2, int y2, u8 clrid) +{ bmp8_line(x1, y1, x2, y2, clrid, vid_page, M4_WIDTH); } diff --git a/engine/src/gba/tonc_memset.s b/engine/src/gba/tonc_memset.s new file mode 100644 index 0000000..86f4354 --- /dev/null +++ b/engine/src/gba/tonc_memset.s @@ -0,0 +1,216 @@ +// +// Alignment-safe and fast memset routines +// +//! \file tonc_memcpy.s +//! \author J Vijn +//! \date 20060508 - 20090801 +// +// === NOTES === +@ * 20050924: Lower overhead for all; reduced i-count for u16 loops. +@ * These are 16/32bit memset and memcpy. The 32bit versions are in +@ iwram for maximum effect and pretty much do what CpuFastSet does, +@ except that it'll work for non multiples of 8 words too. Speed +@ is as good as CpuFastSet, but with a little less overhead. +@ * The 16bit versions call the 32bit ones if possible and/or desirable. +@ They are thumb/ROM functions but did them in asm anyway because +@ GCC goes haywire with the use of registers resulting in a much +@ higher overhead (i.e., detrimental for low counts) +@ * Crossover with inline while(nn--) loops (not for(ii++), which are +@ much slower): +@ memset32: ~5 +@ memset16: ~8 + + .file "tonc_memset.s" + + +#define DEF_SIZE(_name) .size _name, .-_name + +//! \name Section definitions for assembly. +//\{ + +#define CSEC_TEXT .text //!< Standard code section directive. +#define CSEC_EWRAM .section .ewram , "ax", %progbits //!< EWRAM code section directive. +#define CSEC_IWRAM .section .iwram, "ax", %progbits //!< IWRAM code section directive. + +#define DSEC_DATA .data //must be word aligned. + \note \a r0 returns as \a dst + \a wdn. +*/ +/* Reglist: + r0, r1: dst, src + r2: wdn, then wdn>>3 + r3-r10: data buffer + r12: wdn&7 +*/ +BEGIN_FUNC_ARM(memset32, CSEC_IWRAM) + and r12, r2, #7 + movs r2, r2, lsr #3 + beq .Lres_set32 + push {r4-r9} + @ set 32byte chunks with 8fold xxmia + mov r3, r1 + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov r8, r1 + mov r9, r1 +.Lmain_set32: + stmia r0!, {r1, r3-r9} + subs r2, r2, #1 + bhi .Lmain_set32 + pop {r4-r9} + @ residual 0-7 words +.Lres_set32: + subs r12, r12, #1 + stmhsia r0!, {r1} + bhi .Lres_set32 + bx lr +END_FUNC(memset32) + +@ === void memset16(void *dst, u16 src, u32 hwn); ===================== +/*! \fn void memset16(void *dst, u16 src, u32 hwn); + \brief Fill for halfwords. + Uses memset32() if \a hwn>5 + \param dst Destination address. + \param src Source halfword (not address). + \param wdn Number of halfwords to fill. + \note \a dst must be halfword aligned. + \note \a r0 returns as \a dst + \a hwn. +*/ +/* Reglist: + r0, r1: dst, src + r2, r4: wdn + r3: tmp; and data buffer +*/ +BEGIN_FUNC_THUMB(memset16, CSEC_TEXT) + push {r4, lr} + @ under 6 hwords -> std set + cmp r2, #5 + bls .Ltail_set16 + @ dst not word aligned: copy 1 hword and align + lsl r3, r0, #31 + bcc .Lmain_set16 + strh r1, [r0] + add r0, #2 + sub r2, r2, #1 + @ Again, memset32 does the real work +.Lmain_set16: + lsl r4, r1, #16 + orr r1, r4 + lsl r4, r2, #31 + lsr r2, r2, #1 + ldr r3, =memset32 + bl .Llong_bl + @ NOTE: r0 is altered by memset32, but in exactly the right + @ way, so we can use is as is. r1 is now doubled though. + lsr r2, r4, #31 + beq .Lend_set16 + lsr r1, #16 +.Ltail_set16: + sub r2, #1 + bcc .Lend_set16 @ r2 was 0, bug out + lsl r2, r2, #1 +.Lres_set16: + strh r1, [r0, r2] + sub r2, r2, #2 + bcs .Lres_set16 +.Lend_set16: + pop {r4} + pop {r3} +.Llong_bl: + bx r3 +END_FUNC(memset16) + + +@ EOF diff --git a/engine/src/gba_engine.cpp b/engine/src/gba_engine.cpp index c162b6c..be7030a 100644 --- a/engine/src/gba_engine.cpp +++ b/engine/src/gba_engine.cpp @@ -3,6 +3,7 @@ // #include +#include #include #include #include @@ -145,42 +146,15 @@ void GBAEngine::flipPage() { REG_DISPCNT ^= DCNT_PAGE; } -// http://www.coranac.com/tonc/text/bitmaps.htm -// this thing is supposed to be very slow. see link above. inline void GBAEngine::plotPixel(const VectorPx &pixel, u8 clrId) { - u16 *dst = &vid_page[(pixel.y() * M4_WIDTH + pixel.x()) / 2]; - if(pixel.x() & 1) { - *dst = (*dst & 0xFF) | (clrId << 8); - } else { - *dst = (*dst & ~0xFF) | clrId; - } + m4_plot(pixel.x(), pixel.y(), clrId); } -// more or less 1-to-1: -// https://www.davrous.com/2013/06/14/tutorial-part-2-learning-how-to-write-a-3d-soft-engine-from-scratch-in-c-ts-or-js-drawing-lines-triangles/ inline void GBAEngine::plotLine(const VectorPx &point0, const VectorPx &point1, u8 clrId) { - int x0 = point0.x(); - int y0 = point0.y(); - int x1 = point1.x(); - int y1 = point1.y(); - - int dx = ABS(x1 - x0); - int dy = ABS(y1 - y0); - int sx = (x0 < x1) ? 1 : -1; - int sy = (y0 < y1) ? 1 : -1; - int err = dx - dy; - - while (true) { - plotPixel(VectorPx(x0, y0), clrId); - - if ((x0 == x1) && (y0 == y1)) break; - auto e2 = 2 * err; - if (e2 > -dy) { err -= dy; x0 += sx; } - if (e2 < dx) { err += dx; y0 += sy; } - } + // uses tonc's optimalization tricks to get 10 FPS extra compared to standard bline algorithms + m4_line(point0.x(), point0.y(), point1.x(), point1.y(), clrId); } - inline VectorPx GBAEngine::project(const VectorFx &coord, const MatrixFx &transMat) { auto point = MatrixFx::transformCoordinates(coord, transMat);