/*	$OpenBSD: _memcpy.S,v 1.7 2017/10/29 02:21:33 guenther Exp $	*/
/*	$NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "DEFS.h"

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code, e.g. Iron Maiden.
 *
 * For anyone attempting to understand it:
 *
 * The core code is implemented here, with simple stubs for memcpy(),
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_.
 * Following the prefix, labels starting with 'f' are used in the forward
 * copy code, while labels starting with 'b' are used in the backwards
 * copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy:
 * - unaligned source address
 * - unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code uses LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses (dest, src and len) throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
 *
 * Apologies for the state of the comments ;-)
 */
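
/*
 * Illustrative sketch of the control flow, in rough C, purely as a reading
 * aid (names below are hypothetical; the real code keeps dest/src/len in
 * r0/r1/r2 and splits the alignment cases into separate loops):
 *
 *	if (src >= dst) {			// forward copy is safe
 *		byte-copy until dst is word aligned;
 *		if (src is still unaligned)
 *			use the shift-and-merge loops (fsrcul*);
 *		else
 *			copy 32 bytes per LDM/STM loop iteration;
 *		then handle the 12/4/1-byte tail;
 *	} else {				// overlapping, dst above src
 *		src += len; dst += len;
 *		do the same, walking downwards (the b* labels);
 *	}
 *
 * Choosing the direction this way is what makes overlapping memmove()
 * and bcopy() calls work.
 */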
65d987040fSdrahn
66c6b709f5Sjsg.syntax unified
67c6b709f5Sjsg
68*8ead0783Sguenther.hidden _memcpy
69*8ead0783Sguenther
70d987040fSdrahnENTRY(_memcpy)
71d987040fSdrahn	/* Determine copy direction */
72d987040fSdrahn	cmp	r1, r0
73d987040fSdrahn	bcc	.Lmemcpy_backwards
74d987040fSdrahn
	moveq	r0, #0			/* Quick abort for src == dst */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */

.Lmemcpy_ft8:
	/* We have aligned source and destination */
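	/*
	 * Running count note: r2 already had 4 subtracted at the entry
	 * check; 8 more are subtracted here and 0x14 more below, so inside
	 * the 32-byte loop r2 holds "bytes left - 32".  The fl32/fl12/fl4
	 * paths add those amounts back before handling the tail.
	 */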
	subs	r2, r2, #8
	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
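	/*
	 * Each ldmia/stmia pair below transfers four registers (16 bytes),
	 * so one loop pass moves 32 bytes; r12 and lr double as data
	 * registers (lr was saved on entry, r4 just above).  After the
	 * loop, cmn r2, #0x10 tests whether 16 or more bytes of the biased
	 * count remain.
	 */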
.Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_floop32

	cmn	r2, #0x10
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_floop12:
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemcpy_floop12

.Lmemcpy_fl12:
	adds	r2, r2, #8
	blt	.Lmemcpy_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, r12}
	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmiaeq	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
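	/*
	 * r2 is 1, 2 or 3 here.  The cmp against 2 lets the conditional
	 * loads/stores below copy exactly that many bytes with no further
	 * branches: the first pair always runs, the second for r2 >= 2
	 * (ge) and the third only for r2 == 3 (gt).
	 */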
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_fdestul:
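	/*
	 * r12 holds dst & 3; rsb turns it into 4 - (dst & 3), the number
	 * of bytes (1-3) needed to reach the next word boundary, which are
	 * then copied with the same conditional byte-copy trick as the
	 * tail code.
	 */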
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
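	/*
	 * The source is not word aligned.  Idea (little-endian): round src
	 * down to a word boundary, load whole words, and build each aligned
	 * destination word from the top bytes of one source word and the
	 * bottom bytes of the next.  Rough C for a one-byte misalignment
	 * (the fsrcul1 case below; variable names are illustrative only):
	 *
	 *	prev = *word_src++;
	 *	while (4 or more bytes remain) {
	 *		cur = *word_src++;
	 *		*word_dst++ = (prev >> 8) | (cur << 24);
	 *		prev = cur;
	 *	}
	 *
	 * fsrcul2 and fsrcul3 do the same with 16/16 and 24/8 bit shifts,
	 * and the *loop16 variants unroll this four words per iteration.
	 */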
.Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_fsrcul3
	beq	.Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul1l4

.Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul1loop4

.Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul2l4

.Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul2loop4

.Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul3l4

.Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul3loop4

.Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_fl4

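	/*
	 * Backwards copy: taken when src < dst, so an overlapping source
	 * is never overwritten before it has been read.  Both pointers are
	 * first advanced past the end, and every loop below walks downwards
	 * (ldmdb/stmdb and negative pre-indexed offsets); otherwise the
	 * structure mirrors the forward code above.
	 */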
.Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */

.Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_bloop32

.Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmdbge	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmdbge	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	.Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmdbge	r1!, {r3, r12}
	stmdbge	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
.Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
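	/*
	 * Backwards version of the shift-and-merge scheme above: the copy
	 * walks downwards, so the shifts are mirrored (lsl keeps the low
	 * bytes of the word already loaded, lsr pulls in the high bytes of
	 * the next lower word).
	 */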
.Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemcpy_bsrcul1
	beq	.Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul3l4

.Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul3loop4

.Lmemcpy_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul2l4

.Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul2loop4

.Lmemcpy_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul1l4

.Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul1loop4

.Lmemcpy_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemcpy_bl4
END(_memcpy)