xref: /openbsd-src/sys/lib/libkern/arch/arm/memcpy.S (revision c6b709f57b9f0eb79d5cd435ebd6ead1f648a81f)
1*c6b709f5Sjsg/*	$OpenBSD: memcpy.S,v 1.6 2015/06/08 14:22:05 jsg Exp $	*/
27c0511a1Sdrahn/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/
37c0511a1Sdrahn
47c0511a1Sdrahn/*-
57c0511a1Sdrahn * Copyright (c) 1997 The NetBSD Foundation, Inc.
67c0511a1Sdrahn * All rights reserved.
77c0511a1Sdrahn *
87c0511a1Sdrahn * This code is derived from software contributed to The NetBSD Foundation
97c0511a1Sdrahn * by Neil A. Carson and Mark Brinicombe
107c0511a1Sdrahn *
117c0511a1Sdrahn * Redistribution and use in source and binary forms, with or without
127c0511a1Sdrahn * modification, are permitted provided that the following conditions
137c0511a1Sdrahn * are met:
147c0511a1Sdrahn * 1. Redistributions of source code must retain the above copyright
157c0511a1Sdrahn *    notice, this list of conditions and the following disclaimer.
167c0511a1Sdrahn * 2. Redistributions in binary form must reproduce the above copyright
177c0511a1Sdrahn *    notice, this list of conditions and the following disclaimer in the
187c0511a1Sdrahn *    documentation and/or other materials provided with the distribution.
197c0511a1Sdrahn *
207c0511a1Sdrahn * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
217c0511a1Sdrahn * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
227c0511a1Sdrahn * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
237c0511a1Sdrahn * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
247c0511a1Sdrahn * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
257c0511a1Sdrahn * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
267c0511a1Sdrahn * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
277c0511a1Sdrahn * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
287c0511a1Sdrahn * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
297c0511a1Sdrahn * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
307c0511a1Sdrahn * POSSIBILITY OF SUCH DAMAGE.
317c0511a1Sdrahn */
327c0511a1Sdrahn
337c0511a1Sdrahn#include <machine/asm.h>
347c0511a1Sdrahn
357c0511a1Sdrahn/*
367c0511a1Sdrahn * This is one fun bit of code ...
377c0511a1Sdrahn * Some easy listening music is suggested while trying to understand this
387c0511a1Sdrahn * code e.g. Iron Maiden
397c0511a1Sdrahn *
407c0511a1Sdrahn * For anyone attempting to understand it :
417c0511a1Sdrahn *
427c0511a1Sdrahn * The core code is implemented here with simple stubs for memcpy()
437c0511a1Sdrahn * memmove() and bcopy().
447c0511a1Sdrahn *
457c0511a1Sdrahn * All local labels are prefixed with Lmemcpy_
467c0511a1Sdrahn * Following the prefix a label starting f is used in the forward copy code
477c0511a1Sdrahn * while a label using b is used in the backwards copy code
487c0511a1Sdrahn * The source and destination addresses determine whether a forward or
497c0511a1Sdrahn * backward copy is performed.
507c0511a1Sdrahn * Separate bits of code are used to deal with the following situations
517c0511a1Sdrahn * for both the forward and backwards copy.
527c0511a1Sdrahn * unaligned source address
537c0511a1Sdrahn * unaligned destination address
547c0511a1Sdrahn * Separate copy routines are used to produce an optimised result for each
557c0511a1Sdrahn * of these cases.
567c0511a1Sdrahn * The copy code will use LDM/STM instructions to copy up to 32 bytes at
577c0511a1Sdrahn * a time where possible.
587c0511a1Sdrahn *
597c0511a1Sdrahn * Note: r12 (aka ip) can be trashed during the function along with
607c0511a1Sdrahn * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
617c0511a1Sdrahn * Additional registers are preserved prior to use i.e. r4, r5 & lr
627c0511a1Sdrahn *
637c0511a1Sdrahn * Apologies for the state of the comments ;-)
647c0511a1Sdrahn */
657c0511a1Sdrahn
66*c6b709f5Sjsg.syntax unified
67*c6b709f5Sjsg
/*
 * memcpy() and memmove() share one body.
 *
 * In:   r0 = destination, r1 = source, r2 = length in bytes
 * Out:  r0 = destination as passed in
 * r3, r12 and lr are used as scratch; lr is free for that because it is
 * stacked together with r0 below.
 *
 * This part chooses the copy direction and, for a forward copy,
 * dispatches on the length and on the alignment of both pointers.
 */
687c0511a1SdrahnENTRY(memcpy)
697c0511a1SdrahnENTRY_NP(memmove)
707c0511a1Sdrahn	/* Determine copy direction: flags = src - dest */
717c0511a1Sdrahn	cmp	r1, r0
727c0511a1Sdrahn
737c0511a1Sdrahn	moveq	pc, lr			/* src == dest: nothing to do */
747c0511a1Sdrahn
757c0511a1Sdrahn	/* save leaf functions having to store this away */
767c0511a1Sdrahn	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
777c0511a1Sdrahn
	/*
	 * stmdb leaves the flags alone, so this still tests the cmp above:
	 * a source below the destination (unsigned) is copied from the top
	 * down, everything else from the bottom up.
	 */
787c0511a1Sdrahn	bcc	Lmemcpy_backwards
797c0511a1Sdrahn
807c0511a1Sdrahn	/* start of forwards copy */
817c0511a1Sdrahn	subs	r2, r2, #4		/* r2 = len - 4 */
827c0511a1Sdrahn	blt	Lmemcpy_fl4		/* less than 4 bytes */
837c0511a1Sdrahn	ands	r12, r0, #3		/* r12 = dest & 3 */
847c0511a1Sdrahn	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
857c0511a1Sdrahn	ands	r12, r1, #3		/* r12 = src & 3 */
867c0511a1Sdrahn	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */
	/* both pointers word aligned: fall through into Lmemcpy_ft8 */
877c0511a1Sdrahn
/*
 * Forward copy with source and destination both word aligned, followed
 * by the byte tail and function exit shared by every forward path.
 *
 * r2 is kept biased so that a plain sign test answers "is there enough
 * left for the next chunk size?".  On entry r2 = (bytes remaining) - 4.
 */
887c0511a1SdrahnLmemcpy_ft8:
897c0511a1Sdrahn	/* We have aligned source and destination */
907c0511a1Sdrahn	subs	r2, r2, #8		/* r2 = remaining - 12 */
917c0511a1Sdrahn	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
927c0511a1Sdrahn	subs	r2, r2, #0x14		/* r2 = remaining - 32 */
937c0511a1Sdrahn	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
947c0511a1Sdrahn	stmdb	sp!, {r4}		/* borrow r4 */
957c0511a1Sdrahn
967c0511a1Sdrahn	/* blat 32 bytes at a time */
977c0511a1Sdrahn	/* XXX for really big copies perhaps we should use more registers */
	/* two 16-byte ldm/stm pairs per pass; loops while r2 stays >= 0 */
987c0511a1SdrahnLmemcpy_floop32:
997c0511a1Sdrahn	ldmia	r1!, {r3, r4, r12, lr}
1007c0511a1Sdrahn	stmia	r0!, {r3, r4, r12, lr}
1017c0511a1Sdrahn	ldmia	r1!, {r3, r4, r12, lr}
1027c0511a1Sdrahn	stmia	r0!, {r3, r4, r12, lr}
1037c0511a1Sdrahn	subs	r2, r2, #0x20
1047c0511a1Sdrahn	bge	Lmemcpy_floop32
1057c0511a1Sdrahn
	/* r2 = remaining - 32 (negative); r2 + 16 >= 0 means >= 16 remain */
1067c0511a1Sdrahn	cmn	r2, #0x10
107*c6b709f5Sjsg	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
108*c6b709f5Sjsg	stmiage	r0!, {r3, r4, r12, lr}
1097c0511a1Sdrahn	subge	r2, r2, #0x10
1107c0511a1Sdrahn	ldmia	sp!, {r4}		/* return r4 */
1117c0511a1Sdrahn
	/* reached by fall-through or from the "< 32" test; r2 = remaining - 32 */
1127c0511a1SdrahnLmemcpy_fl32:
1137c0511a1Sdrahn	adds	r2, r2, #0x14		/* r2 = remaining - 12, sets flags */
1147c0511a1Sdrahn
1157c0511a1Sdrahn	/* blat 12 bytes at a time */
	/*
	 * The first pass is conditional on the adds above, later passes on
	 * the subsge; once the condition fails every instruction here is
	 * skipped and r2 keeps the value remaining - 12.
	 */
1167c0511a1SdrahnLmemcpy_floop12:
117*c6b709f5Sjsg	ldmiage	r1!, {r3, r12, lr}
118*c6b709f5Sjsg	stmiage	r0!, {r3, r12, lr}
119*c6b709f5Sjsg	subsge	r2, r2, #0x0c
1207c0511a1Sdrahn	bge	Lmemcpy_floop12
1217c0511a1Sdrahn
	/* r2 = remaining - 12 on entry */
1227c0511a1SdrahnLmemcpy_fl12:
1237c0511a1Sdrahn	adds	r2, r2, #8		/* r2 = remaining - 4 */
1247c0511a1Sdrahn	blt	Lmemcpy_fl4
1257c0511a1Sdrahn
	/*
	 * 4..11 bytes remain.  lt: copy one word.  ge: copy two words and
	 * take off a further 4, so either way r2 ends up as
	 * (bytes still remaining) - 4 for Lmemcpy_fl4.
	 */
1267c0511a1Sdrahn	subs	r2, r2, #4
1277c0511a1Sdrahn	ldrlt	r3, [r1], #4
1287c0511a1Sdrahn	strlt	r3, [r0], #4
129*c6b709f5Sjsg	ldmiage	r1!, {r3, r12}
130*c6b709f5Sjsg	stmiage	r0!, {r3, r12}
1317c0511a1Sdrahn	subge	r2, r2, #4
1327c0511a1Sdrahn
	/*
	 * Common forward tail; also the branch target of the short-length
	 * test at entry and of the unaligned forward paths.
	 * On entry r2 = (bytes remaining) - 4 with 0..3 bytes remaining.
	 */
1337c0511a1SdrahnLmemcpy_fl4:
1347c0511a1Sdrahn	/* less than 4 bytes to go */
1357c0511a1Sdrahn	adds	r2, r2, #4		/* r2 = remaining; Z set when done */
	/*
	 * Nothing left: pop the saved destination into r0 and return.
	 * NOTE(review): the guard tests __APCS_26_ (single trailing
	 * underscore); if __APCS_26__ was meant, the '^' form is never
	 * selected -- confirm.  No other exit in this file has a '^' form.
	 */
1367c0511a1Sdrahn#ifdef __APCS_26_
137*c6b709f5Sjsg	ldmiaeq	sp!, {r0, pc}^		/* done */
1387c0511a1Sdrahn#else
139*c6b709f5Sjsg	ldmiaeq	sp!, {r0, pc}		/* done */
1407c0511a1Sdrahn#endif
1417c0511a1Sdrahn	/* copy the crud byte at a time */
	/* r2 is 1..3: first byte always, second if r2 >= 2, third if r2 > 2 */
1427c0511a1Sdrahn	cmp	r2, #2
1437c0511a1Sdrahn	ldrb	r3, [r1], #1
1447c0511a1Sdrahn	strb	r3, [r0], #1
145*c6b709f5Sjsg	ldrbge	r3, [r1], #1
146*c6b709f5Sjsg	strbge	r3, [r0], #1
147*c6b709f5Sjsg	ldrbgt	r3, [r1], #1
148*c6b709f5Sjsg	strbgt	r3, [r0], #1
1497c0511a1Sdrahn	ldmia	sp!, {r0, pc}		/* r0 = saved dest, return */
1507c0511a1Sdrahn
1517c0511a1Sdrahn	/* erg - unaligned destination */
	/*
	 * Forward copy, destination not word aligned.
	 * On entry r12 = dest & 3 (1..3) and r2 = len - 4, which is >= 0
	 * because the only branch here comes after the "< 4 bytes" test, so
	 * the up to three byte copies below stay inside the buffers.
	 */
1527c0511a1SdrahnLmemcpy_fdestul:
1537c0511a1Sdrahn	rsb	r12, r12, #4		/* r12 = bytes needed to align dest */
1547c0511a1Sdrahn	cmp	r12, #2
1557c0511a1Sdrahn
1567c0511a1Sdrahn	/* align destination with byte copies */
	/* one byte always, a second if r12 >= 2, a third if r12 == 3 */
1577c0511a1Sdrahn	ldrb	r3, [r1], #1
1587c0511a1Sdrahn	strb	r3, [r0], #1
159*c6b709f5Sjsg	ldrbge	r3, [r1], #1
160*c6b709f5Sjsg	strbge	r3, [r0], #1
161*c6b709f5Sjsg	ldrbgt	r3, [r1], #1
162*c6b709f5Sjsg	strbgt	r3, [r0], #1
1637c0511a1Sdrahn	subs	r2, r2, r12		/* r2 = remaining - 4 again */
1647c0511a1Sdrahn	blt	Lmemcpy_fl4		/* less than 4 bytes */
1657c0511a1Sdrahn
1667c0511a1Sdrahn	ands	r12, r1, #3		/* r12 = src & 3 */
1677c0511a1Sdrahn	beq	Lmemcpy_ft8		/* we have an aligned source */
	/* otherwise fall through into Lmemcpy_fsrcul with r12 = src & 3 */
1687c0511a1Sdrahn
1697c0511a1Sdrahn	/* erg - unaligned source */
1707c0511a1Sdrahn	/* This is where it gets nasty ... */
	/*
	 * Forward copy, destination word aligned, source not.
	 * On entry r12 = src & 3 (1..3) and r2 = (bytes remaining) - 4 >= 0.
	 *
	 * The source is only ever read as whole aligned words.  lr carries
	 * the most recently loaded word; each output word is built from the
	 * not yet used bytes of lr plus the first bytes of the next word.
	 * The shift directions used correspond to little-endian byte order.
	 */
1717c0511a1SdrahnLmemcpy_fsrcul:
1727c0511a1Sdrahn	bic	r1, r1, #3		/* round src down to a word */
1737c0511a1Sdrahn	ldr	lr, [r1], #4		/* first partial word, r1 -> next */
1747c0511a1Sdrahn	cmp	r12, #2
1757c0511a1Sdrahn	bgt	Lmemcpy_fsrcul3		/* src & 3 == 3 */
1767c0511a1Sdrahn	beq	Lmemcpy_fsrcul2		/* src & 3 == 2 */
	/* src & 3 == 1: lr holds 3 useful bytes, 1 comes from the next word */
1777c0511a1Sdrahn	cmp	r2, #0x0c
1787c0511a1Sdrahn	blt	Lmemcpy_fsrcul1loop4	/* fewer than 16 bytes remain */
1797c0511a1Sdrahn	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
1807c0511a1Sdrahn	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
1817c0511a1Sdrahn
	/* 16 bytes per pass: load four words, merge, store four words */
1827c0511a1SdrahnLmemcpy_fsrcul1loop16:
1837c0511a1Sdrahn	mov	r3, lr, lsr #8
1847c0511a1Sdrahn	ldmia	r1!, {r4, r5, r12, lr}
1857c0511a1Sdrahn	orr	r3, r3, r4, lsl #24
1867c0511a1Sdrahn	mov	r4, r4, lsr #8
1877c0511a1Sdrahn	orr	r4, r4, r5, lsl #24
1887c0511a1Sdrahn	mov	r5, r5, lsr #8
1897c0511a1Sdrahn	orr	r5, r5, r12, lsl #24
1907c0511a1Sdrahn	mov	r12, r12, lsr #8
1917c0511a1Sdrahn	orr	r12, r12, lr, lsl #24
1927c0511a1Sdrahn	stmia	r0!, {r3-r5, r12}
1937c0511a1Sdrahn	subs	r2, r2, #0x10
1947c0511a1Sdrahn	bge	Lmemcpy_fsrcul1loop16
1957c0511a1Sdrahn	ldmia	sp!, {r4, r5}		/* return r4, r5 */
1967c0511a1Sdrahn	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
1977c0511a1Sdrahn	blt	Lmemcpy_fsrcul1l4
1987c0511a1Sdrahn
	/* one word per pass while at least 4 bytes remain */
1997c0511a1SdrahnLmemcpy_fsrcul1loop4:
2007c0511a1Sdrahn	mov	r12, lr, lsr #8
2017c0511a1Sdrahn	ldr	lr, [r1], #4
2027c0511a1Sdrahn	orr	r12, r12, lr, lsl #24
2037c0511a1Sdrahn	str	r12, [r0], #4
2047c0511a1Sdrahn	subs	r2, r2, #4
2057c0511a1Sdrahn	bge	Lmemcpy_fsrcul1loop4
2067c0511a1Sdrahn
	/*
	 * r1 is one word past the word held in lr, of which only the first
	 * byte has been used; step back 3 to the first uncopied byte.
	 */
2077c0511a1SdrahnLmemcpy_fsrcul1l4:
2087c0511a1Sdrahn	sub	r1, r1, #3
2097c0511a1Sdrahn	b	Lmemcpy_fl4		/* byte tail, r2 = remaining - 4 */
2107c0511a1Sdrahn
/*
 * Forward copy, destination aligned, src & 3 == 2.
 * On entry lr = aligned word containing the first source bytes, r1 points
 * at the following word and r2 = (bytes remaining) - 4.
 * Each output word is the upper half of lr joined with the lower half of
 * the next word (16-bit shifts both ways).
 */
2117c0511a1SdrahnLmemcpy_fsrcul2:
2127c0511a1Sdrahn	cmp	r2, #0x0c
2137c0511a1Sdrahn	blt	Lmemcpy_fsrcul2loop4	/* fewer than 16 bytes remain */
2147c0511a1Sdrahn	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
2157c0511a1Sdrahn	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
2167c0511a1Sdrahn
	/* 16 bytes per pass */
2177c0511a1SdrahnLmemcpy_fsrcul2loop16:
2187c0511a1Sdrahn	mov	r3, lr, lsr #16
2197c0511a1Sdrahn	ldmia	r1!, {r4, r5, r12, lr}
2207c0511a1Sdrahn	orr	r3, r3, r4, lsl #16
2217c0511a1Sdrahn	mov	r4, r4, lsr #16
2227c0511a1Sdrahn	orr	r4, r4, r5, lsl #16
2237c0511a1Sdrahn	mov	r5, r5, lsr #16
2247c0511a1Sdrahn	orr	r5, r5, r12, lsl #16
2257c0511a1Sdrahn	mov	r12, r12, lsr #16
2267c0511a1Sdrahn	orr	r12, r12, lr, lsl #16
2277c0511a1Sdrahn	stmia	r0!, {r3-r5, r12}
2287c0511a1Sdrahn	subs	r2, r2, #0x10
2297c0511a1Sdrahn	bge	Lmemcpy_fsrcul2loop16
2307c0511a1Sdrahn	ldmia	sp!, {r4, r5}		/* return r4, r5 */
2317c0511a1Sdrahn	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
2327c0511a1Sdrahn	blt	Lmemcpy_fsrcul2l4
2337c0511a1Sdrahn
	/* one word per pass while at least 4 bytes remain */
2347c0511a1SdrahnLmemcpy_fsrcul2loop4:
2357c0511a1Sdrahn	mov	r12, lr, lsr #16
2367c0511a1Sdrahn	ldr	lr, [r1], #4
2377c0511a1Sdrahn	orr	r12, r12, lr, lsl #16
2387c0511a1Sdrahn	str	r12, [r0], #4
2397c0511a1Sdrahn	subs	r2, r2, #4
2407c0511a1Sdrahn	bge	Lmemcpy_fsrcul2loop4
2417c0511a1Sdrahn
	/* two bytes of the word in lr are used; back r1 up to the third */
2427c0511a1SdrahnLmemcpy_fsrcul2l4:
2437c0511a1Sdrahn	sub	r1, r1, #2
2447c0511a1Sdrahn	b	Lmemcpy_fl4		/* byte tail, r2 = remaining - 4 */
2457c0511a1Sdrahn
/*
 * Forward copy, destination aligned, src & 3 == 3.
 * On entry lr = aligned word containing the first source byte, r1 points
 * at the following word and r2 = (bytes remaining) - 4.
 * Each output word is the top byte of lr joined with the low three bytes
 * of the next word (lsr #24 / lsl #8).
 */
2467c0511a1SdrahnLmemcpy_fsrcul3:
2477c0511a1Sdrahn	cmp	r2, #0x0c
2487c0511a1Sdrahn	blt	Lmemcpy_fsrcul3loop4	/* fewer than 16 bytes remain */
2497c0511a1Sdrahn	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
2507c0511a1Sdrahn	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
2517c0511a1Sdrahn
	/* 16 bytes per pass */
2527c0511a1SdrahnLmemcpy_fsrcul3loop16:
2537c0511a1Sdrahn	mov	r3, lr, lsr #24
2547c0511a1Sdrahn	ldmia	r1!, {r4, r5, r12, lr}
2557c0511a1Sdrahn	orr	r3, r3, r4, lsl #8
2567c0511a1Sdrahn	mov	r4, r4, lsr #24
2577c0511a1Sdrahn	orr	r4, r4, r5, lsl #8
2587c0511a1Sdrahn	mov	r5, r5, lsr #24
2597c0511a1Sdrahn	orr	r5, r5, r12, lsl #8
2607c0511a1Sdrahn	mov	r12, r12, lsr #24
2617c0511a1Sdrahn	orr	r12, r12, lr, lsl #8
2627c0511a1Sdrahn	stmia	r0!, {r3-r5, r12}
2637c0511a1Sdrahn	subs	r2, r2, #0x10
2647c0511a1Sdrahn	bge	Lmemcpy_fsrcul3loop16
2657c0511a1Sdrahn	ldmia	sp!, {r4, r5}		/* return r4, r5 */
2667c0511a1Sdrahn	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
2677c0511a1Sdrahn	blt	Lmemcpy_fsrcul3l4
2687c0511a1Sdrahn
	/* one word per pass while at least 4 bytes remain */
2697c0511a1SdrahnLmemcpy_fsrcul3loop4:
2707c0511a1Sdrahn	mov	r12, lr, lsr #24
2717c0511a1Sdrahn	ldr	lr, [r1], #4
2727c0511a1Sdrahn	orr	r12, r12, lr, lsl #8
2737c0511a1Sdrahn	str	r12, [r0], #4
2747c0511a1Sdrahn	subs	r2, r2, #4
2757c0511a1Sdrahn	bge	Lmemcpy_fsrcul3loop4
2767c0511a1Sdrahn
	/* three bytes of the word in lr are used; back r1 up to the last one */
2777c0511a1SdrahnLmemcpy_fsrcul3l4:
2787c0511a1Sdrahn	sub	r1, r1, #1
2797c0511a1Sdrahn	b	Lmemcpy_fl4		/* byte tail, r2 = remaining - 4 */
2807c0511a1Sdrahn
/*
 * Backward copy, taken when the source lies below the destination.
 * Both pointers are moved one past the end of their buffers and every
 * access below pre-decrements.  The dispatch mirrors the forward one, but
 * the alignment tested is that of the END addresses.
 */
2817c0511a1SdrahnLmemcpy_backwards:
2827c0511a1Sdrahn	add	r1, r1, r2		/* r1 = src + len */
2837c0511a1Sdrahn	add	r0, r0, r2		/* r0 = dest + len */
2847c0511a1Sdrahn	subs	r2, r2, #4		/* r2 = len - 4 */
2857c0511a1Sdrahn	blt	Lmemcpy_bl4		/* less than 4 bytes */
2867c0511a1Sdrahn	ands	r12, r0, #3		/* r12 = dest end & 3 */
2877c0511a1Sdrahn	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
2887c0511a1Sdrahn	ands	r12, r1, #3		/* r12 = src end & 3 */
2897c0511a1Sdrahn	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */
	/* both end pointers word aligned: fall through into Lmemcpy_bt8 */
2907c0511a1Sdrahn
/*
 * Backward copy with both end pointers word aligned, followed by the
 * byte tail and function exit shared by every backward path.
 * On entry r2 = (bytes remaining) - 4; r0/r1 point one past the next
 * bytes to move and only ever decrease.
 */
2917c0511a1SdrahnLmemcpy_bt8:
2927c0511a1Sdrahn	/* We have aligned source and destination */
2937c0511a1Sdrahn	subs	r2, r2, #8		/* r2 = remaining - 12 */
2947c0511a1Sdrahn	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	/* r4 is stacked before the next test so both routes into */
	/* Lmemcpy_bl32 match the single restore at its end */
2957c0511a1Sdrahn	stmdb	sp!, {r4}
2967c0511a1Sdrahn	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
2977c0511a1Sdrahn	blt	Lmemcpy_bl32
2987c0511a1Sdrahn
2997c0511a1Sdrahn	/* blat 32 bytes at a time */
3007c0511a1Sdrahn	/* XXX for really big copies perhaps we should use more registers */
3017c0511a1SdrahnLmemcpy_bloop32:
3027c0511a1Sdrahn	ldmdb	r1!, {r3, r4, r12, lr}
3037c0511a1Sdrahn	stmdb	r0!, {r3, r4, r12, lr}
3047c0511a1Sdrahn	ldmdb	r1!, {r3, r4, r12, lr}
3057c0511a1Sdrahn	stmdb	r0!, {r3, r4, r12, lr}
3067c0511a1Sdrahn	subs	r2, r2, #0x20
3077c0511a1Sdrahn	bge	Lmemcpy_bloop32
3087c0511a1Sdrahn
	/* r2 = remaining - 32 (negative) on entry */
3097c0511a1SdrahnLmemcpy_bl32:
3107c0511a1Sdrahn	cmn	r2, #0x10		/* r2 + 16 >= 0: at least 16 remain */
311*c6b709f5Sjsg	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
312*c6b709f5Sjsg	stmdbge	r0!, {r3, r4, r12, lr}
3137c0511a1Sdrahn	subge	r2, r2, #0x10
3147c0511a1Sdrahn	adds	r2, r2, #0x14		/* r2 = remaining - 12, sets flags */
315*c6b709f5Sjsg	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
316*c6b709f5Sjsg	stmdbge	r0!, {r3, r12, lr}
3177c0511a1Sdrahn	subge	r2, r2, #0x0c
3187c0511a1Sdrahn	ldmia	sp!, {r4}		/* return r4 */
3197c0511a1Sdrahn
	/* r2 = remaining - 12 on entry */
3207c0511a1SdrahnLmemcpy_bl12:
3217c0511a1Sdrahn	adds	r2, r2, #8		/* r2 = remaining - 4 */
3227c0511a1Sdrahn	blt	Lmemcpy_bl4
	/*
	 * 4..11 bytes remain.  lt: move one word.  ge: move two words and
	 * take off a further 4, leaving r2 = (bytes still remaining) - 4.
	 */
3237c0511a1Sdrahn	subs	r2, r2, #4
3247c0511a1Sdrahn	ldrlt	r3, [r1, #-4]!
3257c0511a1Sdrahn	strlt	r3, [r0, #-4]!
326*c6b709f5Sjsg	ldmdbge	r1!, {r3, r12}
327*c6b709f5Sjsg	stmdbge	r0!, {r3, r12}
3287c0511a1Sdrahn	subge	r2, r2, #4
3297c0511a1Sdrahn
	/*
	 * Common backward tail; also the branch target of the short-length
	 * test in Lmemcpy_backwards and of the unaligned backward paths.
	 * On entry r2 = (bytes remaining) - 4 with 0..3 bytes remaining.
	 */
3307c0511a1SdrahnLmemcpy_bl4:
3317c0511a1Sdrahn	/* less than 4 bytes to go */
3327c0511a1Sdrahn	adds	r2, r2, #4		/* r2 = remaining; Z set when done */
333*c6b709f5Sjsg	ldmiaeq	sp!, {r0, pc}		/* r0 = saved dest, return */
3347c0511a1Sdrahn
3357c0511a1Sdrahn	/* copy the crud byte at a time */
	/* r2 is 1..3: first byte always, second if r2 >= 2, third if r2 > 2 */
3367c0511a1Sdrahn	cmp	r2, #2
3377c0511a1Sdrahn	ldrb	r3, [r1, #-1]!
3387c0511a1Sdrahn	strb	r3, [r0, #-1]!
339*c6b709f5Sjsg	ldrbge	r3, [r1, #-1]!
340*c6b709f5Sjsg	strbge	r3, [r0, #-1]!
341*c6b709f5Sjsg	ldrbgt	r3, [r1, #-1]!
342*c6b709f5Sjsg	strbgt	r3, [r0, #-1]!
3437c0511a1Sdrahn	ldmia	sp!, {r0, pc}		/* r0 = saved dest, return */
3447c0511a1Sdrahn
3457c0511a1Sdrahn	/* erg - unaligned destination */
	/*
	 * Backward copy, destination end not word aligned.
	 * On entry r12 = dest end & 3 (1..3), which is exactly the number of
	 * bytes to move downwards before r0 becomes word aligned, and
	 * r2 = len - 4 >= 0 (the only branch here follows the "< 4 bytes"
	 * test), so up to three byte copies stay inside the buffers.
	 */
3467c0511a1SdrahnLmemcpy_bdestul:
3477c0511a1Sdrahn	cmp	r12, #2
3487c0511a1Sdrahn
3497c0511a1Sdrahn	/* align destination with byte copies */
	/* one byte always, a second if r12 >= 2, a third if r12 == 3 */
3507c0511a1Sdrahn	ldrb	r3, [r1, #-1]!
3517c0511a1Sdrahn	strb	r3, [r0, #-1]!
352*c6b709f5Sjsg	ldrbge	r3, [r1, #-1]!
353*c6b709f5Sjsg	strbge	r3, [r0, #-1]!
354*c6b709f5Sjsg	ldrbgt	r3, [r1, #-1]!
355*c6b709f5Sjsg	strbgt	r3, [r0, #-1]!
3567c0511a1Sdrahn	subs	r2, r2, r12		/* r2 = remaining - 4 again */
3577c0511a1Sdrahn	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
3587c0511a1Sdrahn	ands	r12, r1, #3		/* r12 = src end & 3 */
3597c0511a1Sdrahn	beq	Lmemcpy_bt8		/* we have an aligned source */
	/* otherwise fall through into Lmemcpy_bsrcul with r12 = src end & 3 */
3607c0511a1Sdrahn
3617c0511a1Sdrahn	/* erg - unaligned source */
3627c0511a1Sdrahn	/* This is where it gets nasty ... */
	/*
	 * Backward copy, destination end word aligned, source end not.
	 * On entry r12 = src end & 3 (1..3) and r2 = (bytes remaining) - 4
	 * with r2 >= 0.
	 *
	 * The source is only ever read as whole aligned words.  r3 carries
	 * the most recently loaded word; each output word is built from the
	 * not yet used low bytes of r3 plus the top bytes of the next lower
	 * word.  The shift directions used correspond to little-endian
	 * byte order.
	 */
3637c0511a1SdrahnLmemcpy_bsrcul:
3647c0511a1Sdrahn	bic	r1, r1, #3		/* round src end down to a word */
3657c0511a1Sdrahn	ldr	r3, [r1, #0]		/* top partial word, r1 unchanged */
3667c0511a1Sdrahn	cmp	r12, #2
3677c0511a1Sdrahn	blt	Lmemcpy_bsrcul1		/* src end & 3 == 1 */
3687c0511a1Sdrahn	beq	Lmemcpy_bsrcul2		/* src end & 3 == 2 */
	/* src end & 3 == 3: r3 holds 3 useful bytes, 1 comes from below */
3697c0511a1Sdrahn	cmp	r2, #0x0c
3707c0511a1Sdrahn	blt	Lmemcpy_bsrcul3loop4	/* fewer than 16 bytes remain */
3717c0511a1Sdrahn	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
3727c0511a1Sdrahn	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
3737c0511a1Sdrahn
	/* 16 bytes per pass: load four words, merge, store four words */
3747c0511a1SdrahnLmemcpy_bsrcul3loop16:
3757c0511a1Sdrahn	mov	lr, r3, lsl #8
3767c0511a1Sdrahn	ldmdb	r1!, {r3-r5, r12}
3777c0511a1Sdrahn	orr	lr, lr, r12, lsr #24
3787c0511a1Sdrahn	mov	r12, r12, lsl #8
3797c0511a1Sdrahn	orr	r12, r12, r5, lsr #24
3807c0511a1Sdrahn	mov	r5, r5, lsl #8
3817c0511a1Sdrahn	orr	r5, r5, r4, lsr #24
3827c0511a1Sdrahn	mov	r4, r4, lsl #8
3837c0511a1Sdrahn	orr	r4, r4, r3, lsr #24
3847c0511a1Sdrahn	stmdb	r0!, {r4, r5, r12, lr}
3857c0511a1Sdrahn	subs	r2, r2, #0x10
3867c0511a1Sdrahn	bge	Lmemcpy_bsrcul3loop16
3877c0511a1Sdrahn	ldmia	sp!, {r4, r5}		/* return r4, r5 */
3887c0511a1Sdrahn	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
3897c0511a1Sdrahn	blt	Lmemcpy_bsrcul3l4
3907c0511a1Sdrahn
	/* one word per pass while at least 4 bytes remain */
3917c0511a1SdrahnLmemcpy_bsrcul3loop4:
3927c0511a1Sdrahn	mov	r12, r3, lsl #8
3937c0511a1Sdrahn	ldr	r3, [r1, #-4]!
3947c0511a1Sdrahn	orr	r12, r12, r3, lsr #24
3957c0511a1Sdrahn	str	r12, [r0, #-4]!
3967c0511a1Sdrahn	subs	r2, r2, #4
3977c0511a1Sdrahn	bge	Lmemcpy_bsrcul3loop4
3987c0511a1Sdrahn
	/*
	 * r1 addresses the word held in r3, whose top byte is used up;
	 * its low three bytes remain, so the byte-wise end pointer is r1 + 3.
	 */
3997c0511a1SdrahnLmemcpy_bsrcul3l4:
4007c0511a1Sdrahn	add	r1, r1, #3
4017c0511a1Sdrahn	b	Lmemcpy_bl4		/* byte tail, r2 = remaining - 4 */
4027c0511a1Sdrahn
/*
 * Backward copy, destination end aligned, src end & 3 == 2.
 * On entry r3 = aligned word containing the topmost source bytes, r1
 * addresses that word and r2 = (bytes remaining) - 4.
 * Each output word is the lower half of r3 joined with the upper half of
 * the next lower word (16-bit shifts both ways).
 */
4037c0511a1SdrahnLmemcpy_bsrcul2:
4047c0511a1Sdrahn	cmp	r2, #0x0c
4057c0511a1Sdrahn	blt	Lmemcpy_bsrcul2loop4	/* fewer than 16 bytes remain */
4067c0511a1Sdrahn	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
4077c0511a1Sdrahn	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
4087c0511a1Sdrahn
	/* 16 bytes per pass */
4097c0511a1SdrahnLmemcpy_bsrcul2loop16:
4107c0511a1Sdrahn	mov	lr, r3, lsl #16
4117c0511a1Sdrahn	ldmdb	r1!, {r3-r5, r12}
4127c0511a1Sdrahn	orr	lr, lr, r12, lsr #16
4137c0511a1Sdrahn	mov	r12, r12, lsl #16
4147c0511a1Sdrahn	orr	r12, r12, r5, lsr #16
4157c0511a1Sdrahn	mov	r5, r5, lsl #16
4167c0511a1Sdrahn	orr	r5, r5, r4, lsr #16
4177c0511a1Sdrahn	mov	r4, r4, lsl #16
4187c0511a1Sdrahn	orr	r4, r4, r3, lsr #16
4197c0511a1Sdrahn	stmdb	r0!, {r4, r5, r12, lr}
4207c0511a1Sdrahn	subs	r2, r2, #0x10
4217c0511a1Sdrahn	bge	Lmemcpy_bsrcul2loop16
4227c0511a1Sdrahn	ldmia	sp!, {r4, r5}		/* return r4, r5 */
4237c0511a1Sdrahn	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
4247c0511a1Sdrahn	blt	Lmemcpy_bsrcul2l4
4257c0511a1Sdrahn
	/* one word per pass while at least 4 bytes remain */
4267c0511a1SdrahnLmemcpy_bsrcul2loop4:
4277c0511a1Sdrahn	mov	r12, r3, lsl #16
4287c0511a1Sdrahn	ldr	r3, [r1, #-4]!
4297c0511a1Sdrahn	orr	r12, r12, r3, lsr #16
4307c0511a1Sdrahn	str	r12, [r0, #-4]!
4317c0511a1Sdrahn	subs	r2, r2, #4
4327c0511a1Sdrahn	bge	Lmemcpy_bsrcul2loop4
4337c0511a1Sdrahn
	/* the low two bytes of the word at r1 remain: end pointer = r1 + 2 */
4347c0511a1SdrahnLmemcpy_bsrcul2l4:
4357c0511a1Sdrahn	add	r1, r1, #2
4367c0511a1Sdrahn	b	Lmemcpy_bl4		/* byte tail, r2 = remaining - 4 */
4377c0511a1Sdrahn
/*
 * Backward copy, destination end aligned, src end & 3 == 1.
 * On entry r3 = aligned word containing the topmost source byte, r1
 * addresses that word and r2 = (bytes remaining) - 4.
 * Each output word is the lowest byte of r3 joined with the top three
 * bytes of the next lower word (lsl #24 / lsr #8).
 */
4387c0511a1SdrahnLmemcpy_bsrcul1:
4397c0511a1Sdrahn	cmp	r2, #0x0c
4407c0511a1Sdrahn	blt	Lmemcpy_bsrcul1loop4	/* fewer than 16 bytes remain */
4417c0511a1Sdrahn	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
4427c0511a1Sdrahn	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
4437c0511a1Sdrahn
	/* despite the label name this moves 16 bytes per pass, like the */
	/* other unaligned-source loops */
4447c0511a1SdrahnLmemcpy_bsrcul1loop32:
4457c0511a1Sdrahn	mov	lr, r3, lsl #24
4467c0511a1Sdrahn	ldmdb	r1!, {r3-r5, r12}
4477c0511a1Sdrahn	orr	lr, lr, r12, lsr #8
4487c0511a1Sdrahn	mov	r12, r12, lsl #24
4497c0511a1Sdrahn	orr	r12, r12, r5, lsr #8
4507c0511a1Sdrahn	mov	r5, r5, lsl #24
4517c0511a1Sdrahn	orr	r5, r5, r4, lsr #8
4527c0511a1Sdrahn	mov	r4, r4, lsl #24
4537c0511a1Sdrahn	orr	r4, r4, r3, lsr #8
4547c0511a1Sdrahn	stmdb	r0!, {r4, r5, r12, lr}
4557c0511a1Sdrahn	subs	r2, r2, #0x10
4567c0511a1Sdrahn	bge	Lmemcpy_bsrcul1loop32
4577c0511a1Sdrahn	ldmia	sp!, {r4, r5}		/* return r4, r5 */
4587c0511a1Sdrahn	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
4597c0511a1Sdrahn	blt	Lmemcpy_bsrcul1l4
4607c0511a1Sdrahn
	/* one word per pass while at least 4 bytes remain */
4617c0511a1SdrahnLmemcpy_bsrcul1loop4:
4627c0511a1Sdrahn	mov	r12, r3, lsl #24
4637c0511a1Sdrahn	ldr	r3, [r1, #-4]!
4647c0511a1Sdrahn	orr	r12, r12, r3, lsr #8
4657c0511a1Sdrahn	str	r12, [r0, #-4]!
4667c0511a1Sdrahn	subs	r2, r2, #4
4677c0511a1Sdrahn	bge	Lmemcpy_bsrcul1loop4
4687c0511a1Sdrahn
	/* the lowest byte of the word at r1 remains: end pointer = r1 + 1 */
4697c0511a1SdrahnLmemcpy_bsrcul1l4:
4707c0511a1Sdrahn	add	r1, r1, #1
4717c0511a1Sdrahn	b	Lmemcpy_bl4		/* byte tail, r2 = remaining - 4 */
4727c0511a1Sdrahn
473