diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt
index 0181b7d..81a1b31 100644
--- a/engine/CMakeLists.txt
+++ b/engine/CMakeLists.txt
@@ -1,16 +1,23 @@
project(gba-bitmap-engine)
-set_property(SOURCE src/gba/sin_lut.s PROPERTY LANGUAGE C)
-set_property(SOURCE src/gba/tonc_bios.s PROPERTY LANGUAGE C)
-set_property(SOURCE src/background/tonc_font.s PROPERTY LANGUAGE C)
-set_source_files_properties(src/gba/tonc_bios.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
-set_source_files_properties(src/background/tonc_font.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
+set_property(SOURCE # The assembly sources listed here are tagged as LANGUAGE C so CMake builds them with the C compiler.
+ src/gba/sin_lut.s # tagged as C only; it is not in the assembler-with-cpp list below
+ src/gba/tonc_bios.s
+ src/gba/tonc_memset.s
+ src/background/tonc_font.s PROPERTY LANGUAGE C)
+set_source_files_properties( # These files get -x assembler-with-cpp, which tells a GCC-style driver to preprocess them before assembling.
+ src/gba/tonc_bios.s
+ src/gba/tonc_memset.s
+ src/background/tonc_font.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
+set_source_files_properties(src/gba/tonc_bmp8.cpp PROPERTIES COMPILE_FLAGS "-Wno-pointer-arith") # turns off the pointer-arith warning (arithmetic on void pointers) for this one file
add_library(${PROJECT_NAME}
src/palette/palette_manager.cpp
src/palette/combined_palette.cpp
src/background/text_stream.cpp
src/gba/sin_lut.s
+ src/gba/tonc_memset.s
src/gba/tonc_bios.s
+ src/gba/tonc_bmp8.cpp
src/background/tonc_font.s
src/gba_engine.cpp
src/math.cpp
diff --git a/engine/include/libgba-sprite-engine/gba/tonc_core.h b/engine/include/libgba-sprite-engine/gba/tonc_core.h
index c3ce2f3..a98babb 100644
--- a/engine/include/libgba-sprite-engine/gba/tonc_core.h
+++ b/engine/include/libgba-sprite-engine/gba/tonc_core.h
@@ -148,58 +148,11 @@ INLINE void *toncset32(void *dst, u32 src, uint count);
// Fast memcpy/set
-void memset16(void *dst, u16 hw, uint hwcount);
-void memcpy16(void *dst, const void* src, uint hwcount);
+extern "C" void memset16(void *dst, u16 hw, uint hwcount);
+extern "C" void memcpy16(void *dst, const void* src, uint hwcount);
-IWRAM_CODE void memset32(void *dst, u32 wd, uint wcount);
-IWRAM_CODE void memcpy32(void *dst, const void* src, uint wcount);
-
-
-//! Fastfill for halfwords, analogous to memset()
-/*! Uses memset32()
if \a hwcount>5
-* \param dst Destination address.
-* \param hw Source halfword (not address).
-* \param hwcount Number of halfwords to fill.
-* \note \a dst must be halfword aligned.
-* \note \a r0 returns as \a dst + \a hwcount*2.
-*/
-void memset16(void *dst, u16 hw, uint hwcount);
-
-//! \brief Copy for halfwords.
-/*! Uses memcpy32()
if \a hwn>6 and
- \a src and \a dst are aligned equally.
- \param dst Destination address.
- \param src Source address.
- \param hwcount Number of halfwords to fill.
- \note \a dst and \a src must be halfword aligned.
- \note \a r0 and \a r1 return as
- \a dst + \a hwcount*2 and \a src + \a hwcount*2.
-*/
-void memcpy16(void *dst, const void* src, uint hwcount);
-
-
-//! Fast-fill by words, analogous to memset()
-/*! Like CpuFastSet(), only without the requirement of
- 32byte chunks and no awkward store-value-in-memory-first issue.
- \param dst Destination address.
- \param wd Fill word (not address).
- \param wdcount Number of words to fill.
- \note \a dst must be word aligned.
- \note \a r0 returns as \a dst + \a wdcount*4.
-*/
-IWRAM_CODE void memset32(void *dst, u32 wd, uint wdcount);
-
-
-//! \brief Fast-copy by words.
-/*! Like CpuFastFill(), only without the requirement of 32byte chunks
- \param dst Destination address.
- \param src Source address.
- \param wdcount Number of words.
- \note \a src and \a dst must be word aligned.
- \note \a r0 and \a r1 return as
- \a dst + \a wdcount*4 and \a src + \a wdcount*4.
-*/
-IWRAM_CODE void memcpy32(void *dst, const void* src, uint wdcount);
+extern "C" IWRAM_CODE void memset32(void *dst, u32 wd, uint wcount);
+extern "C" IWRAM_CODE void memcpy32(void *dst, const void* src, uint wcount);
//\}
diff --git a/engine/include/libgba-sprite-engine/gba/tonc_video.h b/engine/include/libgba-sprite-engine/gba/tonc_video.h
new file mode 100644
index 0000000..3c57f7a
--- /dev/null
+++ b/engine/include/libgba-sprite-engine/gba/tonc_video.h
@@ -0,0 +1,49 @@
+//
+// Created by Wouter Groeneveld on 10/07/20.
+// Excerpts from tonc_video - only taken which was needed (M4)
+//
+
+#ifndef GBA_BITMAP_ENGINE_PROJECT_TONC_VIDEO_H
+#define GBA_BITMAP_ENGINE_PROJECT_TONC_VIDEO_H
+
+#include
+#include
+
+void bmp8_plot(int x, int y, u32 clr, void *dstBase, uint dstP);
+
+void bmp8_hline(int x1, int y, int x2, u32 clr, void *dstBase, uint dstP);
+void bmp8_vline(int x, int y1, int y2, u32 clr, void *dstBase, uint dstP);
+void bmp8_line(int x1, int y1, int x2, int y2, u32 clr,
+ void *dstBase, uint dstP);
+
+INLINE void m4_hline(int x1, int y, int x2, u8 clrid);
+INLINE void m4_vline(int x, int y1, int y2, u8 clrid);
+INLINE void m4_line(int x1, int y1, int x2, int y2, u8 clrid);
+INLINE void m4_plot(int x, int y, u8 clrid);
+
+//! Store palette index \a clrid at (\a x, \a y) in the mode 4 page \c vid_page points to; only that pixel's byte of the shared halfword changes.
+INLINE void m4_plot(int x, int y, u8 clrid)
+{
+ u16 *cell= &vid_page[(y*M4_WIDTH + x)>>1];
+ const u16 old= *cell;
+ // Odd x selects the high byte of the halfword, even x the low byte.
+ const u16 merged= (x&1) ? ((old & 0xFF) | (clrid<<8)) : ((old & ~0xFF) | clrid);
+ *cell= merged;
+}
+
+
+//! Draw a horizontal line of palette index \a clrid between \a x1 and \a x2 on row \a y; forwards to bmp8_hline() with \c vid_page as canvas and M4_WIDTH as pitch.
+INLINE void m4_hline(int x1, int y, int x2, u8 clrid)
+{ bmp8_hline(x1, y, x2, clrid, vid_page, M4_WIDTH); }
+
+
+//! Draw a vertical line of palette index \a clrid between \a y1 and \a y2 in column \a x; forwards to bmp8_vline() with \c vid_page as canvas and M4_WIDTH as pitch.
+INLINE void m4_vline(int x, int y1, int y2, u8 clrid)
+{ bmp8_vline(x, y1, y2, clrid, vid_page, M4_WIDTH); }
+
+
+//! Draw a line of palette index \a clrid from (\a x1, \a y1) to (\a x2, \a y2); forwards to bmp8_line() with \c vid_page as canvas and M4_WIDTH as pitch.
+INLINE void m4_line(int x1, int y1, int x2, int y2, u8 clrid)
+{ bmp8_line(x1, y1, x2, y2, clrid, vid_page, M4_WIDTH); }
+
+#endif //GBA_BITMAP_ENGINE_PROJECT_TONC_VIDEO_H
diff --git a/engine/include/libgba-sprite-engine/gba/toolbox.h b/engine/include/libgba-sprite-engine/gba/toolbox.h
new file mode 100644
index 0000000..e89b2ce
--- /dev/null
+++ b/engine/include/libgba-sprite-engine/gba/toolbox.h
@@ -0,0 +1,18 @@
+//
+// Created by Wouter Groeneveld on 10/07/20.
+//
+
+#ifndef GBA_BITMAP_ENGINE_PROJECT_TOOLBOX_H
+#define GBA_BITMAP_ENGINE_PROJECT_TOOLBOX_H
+
+#define GBA_SCREEN_WIDTH 240 // screen width in pixels
+#define GBA_SCREEN_WIDTH_FX (GBA_SCREEN_WIDTH << 8) // width shifted left by 8 (presumably 8 fractional fixed-point bits - confirm); parenthesized so it expands safely inside larger expressions
+#define GBA_SCREEN_HEIGHT 160 // screen height in pixels
+#define GBA_SCREEN_HEIGHT_FX (GBA_SCREEN_HEIGHT << 8) // height shifted left by 8, parenthesized for the same reason as the width
+
+#define M4_WIDTH GBA_SCREEN_WIDTH
+#define M4_HEIGHT GBA_SCREEN_HEIGHT
+
+extern COLOR *vid_page; // NOTE(review): this header includes nothing, so COLOR must already be declared by whoever includes it - confirm the include order or add the defining header
+
+#endif //GBA_BITMAP_ENGINE_PROJECT_TOOLBOX_H
diff --git a/engine/include/libgba-sprite-engine/gba_engine.h b/engine/include/libgba-sprite-engine/gba_engine.h
index 7858899..468d613 100644
--- a/engine/include/libgba-sprite-engine/gba_engine.h
+++ b/engine/include/libgba-sprite-engine/gba_engine.h
@@ -15,16 +15,8 @@
#include "sound_control.h"
#include "timer.h"
-#define GBA_SCREEN_WIDTH 240
-#define GBA_SCREEN_WIDTH_FX GBA_SCREEN_WIDTH << 8
-#define GBA_SCREEN_HEIGHT 160
-#define GBA_SCREEN_HEIGHT_FX GBA_SCREEN_HEIGHT << 8
-
-
-#define M4_WIDTH 240
const unsigned int black[VRAM_PAGE_SIZE] = {};
-extern u16 *vid_page;
class GBAEngine {
private:
@@ -74,6 +66,7 @@ public:
for(int i = 0; i < times; i++){}
}
+ inline void plotPixel(int x, int y, u8 clrId);
inline void plotPixel(const VectorPx &pixel, u8 clrId);
inline void plotLine(const VectorPx &point0, const VectorPx &point1, u8 clrId);
};
diff --git a/engine/src/gba/tonc_bmp8.cpp b/engine/src/gba/tonc_bmp8.cpp
new file mode 100644
index 0000000..4daaa07
--- /dev/null
+++ b/engine/src/gba/tonc_bmp8.cpp
@@ -0,0 +1,206 @@
+//
+// Created by Wouter Groeneveld on 10/07/20.
+//
+
+//! Plot a single pixel on a 8-bit buffer
+#include
+#include
+#include
+
+/*!
+ \param x X-coord.
+ \param y Y-coord.
+ \param clr Color index; only its low 8 bits end up in the pixel.
+ \param dstBase Canvas pointer (halfword-aligned plz).
+ \param dstP Canvas pitch in bytes.
+ \note Slow: every pixel is a halfword read-modify-write. Inline plotting functionality if possible.
+*/
+void bmp8_plot(int x, int y, u32 clr, void *dstBase, uint dstP)
+{
+ // Offset from a byte pointer: ISO C++ has no arithmetic on void*, that is a GNU extension.
+ u16 *dstD= (u16*)((unsigned char*)dstBase + y*dstP + (x&~1));
+ if(x&1)
+ *dstD= (*dstD& 0xFF) | (clr<<8);
+ else
+ *dstD= (*dstD&~0xFF) | (clr&0xFF);
+}
+
+
+//! Draw a horizontal line on an 8bit buffer
+/*!
+ \param x1 First X-coord.
+ \param y Y-coord.
+ \param x2 Second X-coord.
+ \param clr Color index.
+ \param dstBase Canvas pointer (halfword-aligned plz).
+ \param dstP canvas pitch in bytes.
+ \note Does normalization, but not bounds checks.
+*/
+void bmp8_hline(int x1, int y, int x2, u32 clr, void *dstBase, uint dstP)
+{
+ // --- Normalize ---
+ clr &= 0xFF;
+ if(x2x2)
+ { xstep= -1; dx= x1-x2; }
+ else
+ { xstep= +1; dx= x2-x1; }
+
+ if(y1>y2)
+ { ystep= -dstP; dy= y1-y2; }
+ else
+ { ystep= +dstP; dy= y2-y1; }
+
+
+ // --- Drawing ---
+ // NOTE: because xstep is alternating, you can do marvels
+ // with mask-flips
+ // NOTE: (mask>>31) is equivalent to (x&1) ? 0 : 1
+
+ if(dx>=dy) // Diagonal, slope <= 1
+ {
+ dd= 2*dy - dx;
+
+ for(ii=dx; ii>=0; ii--)
+ {
+ dstL= (u16*)(addr - (mask>>31));
+ *dstL= (*dstL &~ mask) | (clr & mask);
+
+ if(dd >= 0)
+ { dd -= 2*dx; addr += ystep; }
+
+ dd += 2*dy;
+ addr += xstep;
+ mask = ~mask;
+ }
+ }
+ else // # Diagonal, slope > 1
+ {
+ dd= 2*dx - dy;
+
+ for(ii=dy; ii>=0; ii--)
+ {
+ dstL= (u16*)(addr - (mask>>31));
+ *dstL= (*dstL &~ mask) | (clr & mask);
+
+ if(dd >= 0)
+ {
+ dd -= 2*dy;
+ addr += xstep;
+ mask = ~mask;
+ }
+
+ dd += 2*dx;
+ addr += ystep;
+ }
+ }
+}
+
+INLINE void m4_line(int x1, int y1, int x2, int y2, u8 clrid)
+{ bmp8_line(x1, y1, x2, y2, clrid, vid_page, M4_WIDTH); } // NOTE(review): tonc_video.h defines the same INLINE m4_line; if this file includes that header this copy is a redefinition - confirm and drop one
diff --git a/engine/src/gba/tonc_memset.s b/engine/src/gba/tonc_memset.s
new file mode 100644
index 0000000..86f4354
--- /dev/null
+++ b/engine/src/gba/tonc_memset.s
@@ -0,0 +1,216 @@
+//
+// Alignment-safe and fast memset routines
+//
+//! \file tonc_memset.s
+//! \author J Vijn
+//! \date 20060508 - 20090801
+//
+// === NOTES ===
+@ * 20050924: Lower overhead for all; reduced i-count for u16 loops.
+@ * These are 16/32bit memset and memcpy. The 32bit versions are in
+@ iwram for maximum effect and pretty much do what CpuFastSet does,
+@ except that it'll work for non multiples of 8 words too. Speed
+@ is as good as CpuFastSet, but with a little less overhead.
+@ * The 16bit versions call the 32bit ones if possible and/or desirable.
+@ They are thumb/ROM functions but did them in asm anyway because
+@ GCC goes haywire with the use of registers resulting in a much
+@ higher overhead (i.e., detrimental for low counts)
+@ * Crossover with inline while(nn--) loops (not for(ii++), which are
+@ much slower):
+@ memset32: ~5
+@ memset16: ~8
+
+ .file "tonc_memset.s"
+
+
+#define DEF_SIZE(_name) .size _name, .-_name
+
+//! \name Section definitions for assembly.
+//\{
+
+#define CSEC_TEXT .text //!< Standard code section directive.
+#define CSEC_EWRAM .section .ewram , "ax", %progbits //!< EWRAM code section directive.
+#define CSEC_IWRAM .section .iwram, "ax", %progbits //!< IWRAM code section directive.
+
+#define DSEC_DATA .data //must be word aligned.
+ \note \a r0 returns as \a dst + \a wdn.
+*/
+/* Reglist:
+ r0, r1: dst, src
+ r2: wdn, then wdn>>3
+ r1, r3-r9: data buffer
+ r12: wdn&7
+*/
+BEGIN_FUNC_ARM(memset32, CSEC_IWRAM)
+ and r12, r2, #7 @ r12 = word count & 7: words left over after the 8-word chunks
+ movs r2, r2, lsr #3 @ r2 = word count / 8 = number of 8-word chunks; Z is set when there are none
+ beq .Lres_set32 @ no full chunk: go straight to the residual loop
+ push {r4-r9} @ r4-r9 are overwritten below, so save them first
+ @ set 32byte chunks with 8fold xxmia
+ mov r3, r1
+ mov r4, r1
+ mov r5, r1
+ mov r6, r1
+ mov r7, r1
+ mov r8, r1
+ mov r9, r1 @ r1 and r3-r9 now all hold the fill word
+.Lmain_set32:
+ stmia r0!, {r1, r3-r9} @ store 8 words and advance r0 past them
+ subs r2, r2, #1
+ bhi .Lmain_set32 @ repeat while chunks remain
+ pop {r4-r9}
+ @ residual 0-7 words
+.Lres_set32:
+ subs r12, r12, #1 @ carry stays set only if r12 was at least 1
+ stmhsia r0!, {r1} @ conditional store: one word, only when a residual word was pending
+ bhi .Lres_set32 @ loop while r12 is still above zero
+ bx lr
+END_FUNC(memset32)
+
+@ === void memset16(void *dst, u16 src, u32 hwn); =====================
+/*! \fn void memset16(void *dst, u16 src, u32 hwn);
+ \brief Fill for halfwords.
+ Uses memset32()
if \a hwn>5
+ \param dst Destination address.
+ \param src Source halfword (not address).
+ \param hwn Number of halfwords to fill.
+ \note \a dst must be halfword aligned.
+ \note \a r0 returns as \a dst + \a hwn.
+*/
+/* Reglist:
+ r0, r1: dst, src
+ r2, r4: wdn
+ r3: tmp; and data buffer
+*/
+BEGIN_FUNC_THUMB(memset16, CSEC_TEXT)
+ push {r4, lr}
+ @ under 6 hwords -> std set
+ cmp r2, #5
+ bls .Ltail_set16
+ @ dst not word aligned: copy 1 hword and align
+ lsl r3, r0, #31 @ carry = bit 1 of dst (the last bit shifted out)
+ bcc .Lmain_set16 @ bit 1 clear: dst is already word aligned
+ strh r1, [r0]
+ add r0, #2
+ sub r2, r2, #1
+ @ Again, memset32 does the real work
+.Lmain_set16:
+ lsl r4, r1, #16
+ orr r1, r4 @ r1 = fill halfword in both halves of the word (given a clean 16-bit r1 on entry)
+ lsl r4, r2, #31 @ park the odd-count bit of r2 in bit 31 of r4 for after the call
+ lsr r2, r2, #1 @ halfword count -> word count
+ ldr r3, =memset32
+ bl .Llong_bl @ bl sets lr; the bx r3 at .Llong_bl then jumps to memset32
+ @ NOTE: r0 is altered by memset32, but in exactly the right
+ @ way, so we can use is as is. r1 is now doubled though.
+ lsr r2, r4, #31 @ r2 = 1 if one halfword is still left, else 0
+ beq .Lend_set16
+ lsr r1, #16 @ back to a single halfword
+.Ltail_set16:
+ sub r2, #1
+ bcc .Lend_set16 @ r2 was 0, bug out
+ lsl r2, r2, #1 @ r2 = byte offset of the last halfword
+.Lres_set16:
+ strh r1, [r0, r2] @ fill from the last halfword down to offset 0
+ sub r2, r2, #2
+ bcs .Lres_set16
+.Lend_set16:
+ pop {r4}
+ pop {r3} @ r3 = return address that was pushed as lr on entry
+.Llong_bl:
+ bx r3 @ shared exit: returns to the caller, or enters memset32 when reached from the bl above
+END_FUNC(memset16)
+
+
+@ EOF
diff --git a/engine/src/gba_engine.cpp b/engine/src/gba_engine.cpp
index c162b6c..be7030a 100644
--- a/engine/src/gba_engine.cpp
+++ b/engine/src/gba_engine.cpp
@@ -3,6 +3,7 @@
//
#include
+#include
#include
#include
#include
@@ -145,42 +146,15 @@ void GBAEngine::flipPage() {
REG_DISPCNT ^= DCNT_PAGE;
}
-// http://www.coranac.com/tonc/text/bitmaps.htm
-// this thing is supposed to be very slow. see link above.
inline void GBAEngine::plotPixel(const VectorPx &pixel, u8 clrId) {
- u16 *dst = &vid_page[(pixel.y() * M4_WIDTH + pixel.x()) / 2];
- if(pixel.x() & 1) {
- *dst = (*dst & 0xFF) | (clrId << 8);
- } else {
- *dst = (*dst & ~0xFF) | clrId;
- }
+ m4_plot(pixel.x(), pixel.y(), clrId);
}
-// more or less 1-to-1:
-// https://www.davrous.com/2013/06/14/tutorial-part-2-learning-how-to-write-a-3d-soft-engine-from-scratch-in-c-ts-or-js-drawing-lines-triangles/
inline void GBAEngine::plotLine(const VectorPx &point0, const VectorPx &point1, u8 clrId) {
- int x0 = point0.x();
- int y0 = point0.y();
- int x1 = point1.x();
- int y1 = point1.y();
-
- int dx = ABS(x1 - x0);
- int dy = ABS(y1 - y0);
- int sx = (x0 < x1) ? 1 : -1;
- int sy = (y0 < y1) ? 1 : -1;
- int err = dx - dy;
-
- while (true) {
- plotPixel(VectorPx(x0, y0), clrId);
-
- if ((x0 == x1) && (y0 == y1)) break;
- auto e2 = 2 * err;
- if (e2 > -dy) { err -= dy; x0 += sx; }
- if (e2 < dx) { err += dx; y0 += sy; }
- }
+ // uses tonc's optimization tricks to get 10 FPS extra compared to standard bline algorithms
+ m4_line(point0.x(), point0.y(), point1.x(), point1.y(), clrId);
}
-
inline VectorPx GBAEngine::project(const VectorFx &coord, const MatrixFx &transMat) {
auto point = MatrixFx::transformCoordinates(coord, transMat);