/* xref: /netbsd-src/common/lib/libc/arch/arm/string/memcpy_arm.S (revision ba65fde2d7fefa7d39838fa5fa855e62bd606b5e) */
/*	$NetBSD: memcpy_arm.S,v 1.3 2013/01/28 06:23:44 matt Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(__ARM_EABI__)
/* EABI run-time helper: __aeabi_memcpy has the same contract as memcpy. */
STRONG_ALIAS(__aeabi_memcpy, memcpy)
#endif

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it:
 *
 * The core code is implemented here with simple stubs for memcpy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix, a label starting with f is used in the forward copy
 * code while a label using b is used in the backwards copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy:
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses, i.e. src, dest, len throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
 *
 * Apologies for the state of the comments ;-)
 */
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	/*
	 * Register usage throughout:
	 *   r0 = dst (advanced as we store), r1 = src (advanced as we
	 *   load), r2 = remaining length (biased negative by the subs
	 *   below so conditions can test it cheaply), r3/r12/lr = scratch.
	 * r0 and lr are saved up front so every exit path can return the
	 * original dst and pop pc in a single ldm; r4 (and r5) are only
	 * pushed around the wide block-copy loops that need them.
	 */
	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10		/* at least 16 bytes still pending? */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14		/* undo bias; flags feed loop12 */

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* 4..7 bytes left: copy one word */
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}		/* 8..11 bytes left: copy two words */
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/*
	 * 1..3 trailing bytes: the cmp sets GE for >= 2 and GT for 3,
	 * so the conditional pairs below copy exactly r2 bytes.
	 */
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4		/* r12 = bytes to next word boundary */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	/*
	 * Destination is now word aligned, source is offset by r12
	 * (1, 2 or 3).  Round the source down to a word boundary,
	 * pre-load the first partial word into lr, and dispatch to the
	 * shift-and-merge copier for that offset.
	 */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source offset 1: merge 8 bits of each word with 24 of the next */
.Lmemcpy_srcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3		/* rewind to true (unaligned) src */
	b	.Lmemcpy_l4

	/* source offset 2: merge 16 bits of each word with 16 of the next */
.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2		/* rewind to true (unaligned) src */
	b	.Lmemcpy_l4

	/* source offset 3: merge 24 bits of each word with 8 of the next */
.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1		/* rewind to true (unaligned) src */
	b	.Lmemcpy_l4

337