//
//  Alignment-safe and fast memset routines
//
//! \file tonc_memset.s
//! \author J Vijn
//! \date 20060508 - 20090801
//
// === NOTES ===
@ * 20050924: Lower overhead for all; reduced i-count for u16 loops.
@ * These are 16/32bit memset and memcpy. The 32bit versions are in
@   iwram for maximum effect and pretty much do what CpuFastSet does,
@   except that they also work for non-multiples of 8 words. Speed
@   is as good as CpuFastSet, but with a little less overhead.
@ * The 16bit versions call the 32bit ones if possible and/or desirable.
@   They are thumb/ROM functions, but were written in asm anyway because
@   GCC goes haywire with the use of registers, resulting in a much
@   higher overhead (i.e., detrimental for low counts).
@ * Crossover with inline while(nn--) loops (not for(ii++), which are
@   much slower):
@     memset32: ~5
@     memset16: ~8

	.file	"tonc_memset.s"

#define DEF_SIZE(_name)		.size _name, .-_name

//! \name Section definitions for assembly.
//\{
#define CSEC_TEXT	.text	//!< Standard code section directive.
#define CSEC_EWRAM	.section .ewram , "ax", %progbits	//!< EWRAM code section directive.
#define CSEC_IWRAM	.section .iwram, "ax", %progbits	//!< IWRAM code section directive.
#define DSEC_DATA	.data	//!< Data section directive.

// NOTE(review): the remainder of this definition group is not visible in
// this view, and neither are the BEGIN_FUNC_ARM / BEGIN_FUNC_THUMB /
// END_FUNC macros used below. They are assumed to be defined elsewhere
// (presumably next to DEF_SIZE) -- confirm before building.

@ === memset32 =========================================================
/*!	Fast fill by words. ARM code, emitted into the section selected by
	CSEC_IWRAM.
	In:  r0 = \a dst, r1 = \a src (the fill word itself, not an address),
	     r2 = \a wdn (number of words to fill).
	\note \a dst must be word aligned.
	\note \a r0 returns as \a dst + \a wdn.
*/
/* Reglist:
	r0:     dst, post-incremented by every store
	r1:     fill word
	r2:     wdn, then wdn>>3 (count of 8-word chunks)
	r3-r9:  copies of the fill word, so one stmia writes 8 words
	r12:    wdn&7 (residual words)
*/
BEGIN_FUNC_ARM(memset32, CSEC_IWRAM)
	and		r12, r2, #7				@ r12 = words left after the chunks
	movs	r2, r2, lsr #3			@ r2 = 8-word chunks; Z set if none
	beq		.Lwords_set32
	stmfd	sp!, {r4-r9}			@ r4-r9 are borrowed as fill buffer
	@ replicate the fill word so a single stmia covers 32 bytes
	mov		r3, r1
	mov		r4, r1
	mov		r5, r1
	mov		r6, r1
	mov		r7, r1
	mov		r8, r1
	mov		r9, r1
.Lchunks_set32:
	stmia	r0!, {r1, r3-r9}
	subs	r2, r2, #1
	bhi		.Lchunks_set32			@ loop while chunks remain
	ldmfd	sp!, {r4-r9}
	@ residual 0-7 words, one at a time
.Lwords_set32:
	subs	r12, r12, #1			@ carry clear (borrow) once r12 was 0
	stmhsia	r0!, {r1}				@ store only if a word was still due
	bhi		.Lwords_set32			@ repeat while more than zero remain
	bx	lr
END_FUNC(memset32)

@ === void memset16(void *dst, u16 src, u32 hwn); =====================
/*! \fn void memset16(void *dst, u16 src, u32 hwn);
	\brief Fill for halfwords. Uses memset32() if \a hwn>5
	\param dst Destination address.
\param src Source halfword (not address).
	\param hwn Number of halfwords to fill.
	\note \a dst must be halfword aligned.
	\note The function is declared void, so r0 on exit is not a result.
	  It equals \a dst + \a hwn only when memset32() did the fill and no
	  odd halfword was left over; the halfword tail loop addresses
	  through [r0, r2] and never advances r0.
*/
/* Reglist:
	r0:  dst; advanced by the alignment store and by memset32
	r1:  src halfword; doubled into a full word around the memset32 call
	r2:  halfword count, then word count for memset32, then byte offset
	     in the tail loop
	r3:  scratch for the alignment test, then address of memset32, then
	     the popped return address
	r4:  bit 0 of the halfword count parked in bit 31 across the call
*/
BEGIN_FUNC_THUMB(memset16, CSEC_TEXT)
	push	{r4, lr}
	@ 5 halfwords or fewer: plain halfword loop only
	cmp		r2, #5
	bls		.Lshort_set16
	@ bit 1 of dst lands in carry: set means dst is not word aligned,
	@ so write one halfword first to reach a word boundary
	lsl		r3, r0, #31
	bcc		.Lwords_set16
	strh	r1, [r0]
	add		r0, #2
	sub		r2, r2, #1
	@ bulk of the fill is delegated to memset32
.Lwords_set16:
	lsl		r4, r1, #16
	orr		r1, r4					@ r1 = src | src<<16
	lsl		r4, r2, #31				@ remember whether the count is odd
	lsr		r2, r2, #1				@ halfwords -> words
	ldr		r3, =memset32
	bl		.Lbx_r3_set16			@ call through r3; lr = next instruction
	@ memset32 leaves r0 just past the words it wrote, which is where
	@ the optional last halfword goes. r1 still holds the doubled value.
	lsr		r2, r4, #31				@ r2 = 1 if a halfword remains; Z if not
	beq		.Ldone_set16
	lsr		r1, #16					@ back to a single halfword
.Lshort_set16:
	sub		r2, #1
	bcc		.Ldone_set16			@ borrow: count was 0, nothing to do
	lsl		r2, r2, #1				@ byte offset of the last halfword
	@ fill from the highest halfword down to offset 0
.Lhwords_set16:
	strh	r1, [r0, r2]
	sub		r2, r2, #2
	bcs		.Lhwords_set16			@ no borrow yet: more below
.Ldone_set16:
	pop		{r4}
	pop		{r3}					@ r3 = saved lr
	@ shared bx: target of the bl above and the function's return
.Lbx_r3_set16:
	bx	r3
END_FUNC(memset16)

@ EOF