mirror of
https://github.com/clockworkpi/PicoCalc.git
synced 2025-12-12 10:18:54 +01:00
62 lines
1.9 KiB
ArmAsm
62 lines
1.9 KiB
ArmAsm
.syntax unified

@ ---------------------------------------------------------------------------
@ copy_words(unsigned long const *src, unsigned long *dst, unsigned long count)
@ Exported under its C++-mangled name, _Z10copy_wordsPKmPmm.
@
@ High-throughput block transfer using the FPU register set as a 128-byte
@ buffer.
@
@ This implementation exploits the IT Folding feature of the Cortex-M4.
@ An IT instruction following a 16-bit Thumb instruction, when both are
@ aligned into a 32-bit memory word, takes no additional cycles to issue.
@
@ Arguments:
@   r0  source address
@   r1  destination address
@   r2  number of words to transfer.
@
@ Clobbers:
@   r0, r1  advanced past the data by the write-back addressing (src!, dst!)
@   r2      consumed as the counter
@   s0-s15  used as the transfer buffer
@   flags   modified by lsrs / subs
@ Preserved:
@   s16-s31 are pushed on entry and popped on exit (64 bytes of stack).
@
@ The numbers in the right-hand comment column are the cycle-count
@ annotations carried over from the original source.
@
@ NOTE: the explicit .n / .w width suffixes keep every lsrs/itt pair inside
@ one 32-bit word and keep the loop branch aligned.  Re-check the alignment
@ before inserting, removing or resizing any instruction.
@ ---------------------------------------------------------------------------

.section .time_critical,"ax",%progbits
.balign 4
.global _Z10copy_wordsPKmPmm
.type _Z10copy_wordsPKmPmm, %function
.thumb_func
_Z10copy_wordsPKmPmm:
        @ Name our registers.
        src     .req r0
        dst     .req r1
        count   .req r2

        @ The caller may have been using floating point.
        @ Save the callee-save portion of the register file.
        vpush   {s16 - s31}                     @ 17

        @ 'Warm up' the transfer engine, which wants to operate in units of
        @ 128 bytes, by making smaller transfers until the remaining word
        @ count is a multiple of 32 (i.e. 128 bytes).
        @
        @ Each step shifts the lowest bit of count out into the carry flag
        @ and, when that bit was set, copies the matching power-of-two number
        @ of words (1, 2, 4, 8, 16).

        @ Special-case the single word transfer; the macro below won't work.
        lsrs.n  count, #1                       @ 1
        itt     cs                              @ 0 (aligned)
        vldmcs.32 src!, {s0}                    @ 2
        vstmcs.32 dst!, {s0}                    @ 2

        @ Transfer n+1 words if the next-lowest bit of count is set.
        .macro XFER n                           @ 5 + 2*n
          lsrs.n  count, #1                     @ 1
          itt     cs                            @ 0 (aligned)
          vldmcs.32 src!, {s0 - s\n}            @ 1+1+n
          vstmcs.32 dst!, {s0 - s\n}            @ 1+1+n
        .endm

        XFER 1                                  @ 7
        XFER 3                                  @ 11
        XFER 7                                  @ 19
        XFER 15                                 @ 35

        @ The macro is only needed above; drop it so it cannot collide with
        @ anything assembled later in the same translation unit.
        .purgem XFER

        @ From here on, count holds the number of whole 32-word (128-byte)
        @ blocks left to copy, not the number of words.
        @
        @ Handle the case where we've been asked to transfer <32 words.
        @ In such a case, count will now be zero, and the Z flag will still
        @ be set from the last XFER.
        @
        @ Force the branch to use a 32-bit instruction to preserve alignment
        @ of the loop branch below; this saves a cycle per time that branch
        @ is taken.
        @
        @ Note that the target of this branch (at 1 below) is also aligned,
        @ saving a cycle on the rare escape path.
        beq.w   1f                              @ 1 (n.t.)

        @ All warmed up, transfer in units of 128 bytes.
0:      vldm.32 src!, {s0 - s31}                @ 33
        vstm.32 dst!, {s0 - s31}                @ 33
        subs.n  count, #1                       @ 1
        bne.n   0b                              @ ~3 (taken)

        @ Restore FPU state.
1:      vpop    {s16 - s31}                     @ 17
        bx      lr                              @ 1-3??

        @ Release the register aliases so they do not leak past this routine.
        .unreq  src
        .unreq  dst
        .unreq  count

.size _Z10copy_wordsPKmPmm, . - _Z10copy_wordsPKmPmm