/*	$OpenBSD: memcpy.S,v 1.3 2008/06/26 05:42:20 ray Exp $	*/
/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. src, dest, len throughout.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * in:  r0 = dst, r1 = src, r2 = len
 * out: r0 = dst (saved on entry, reloaded on every normal exit)
 *
 * memcpy() and memmove() share one body: a backwards copy is selected
 * when src < dst, so overlapping regions are handled for both entries.
 * Under __APCS_26__ the returns use the flag-restoring forms
 * (movs pc, lr / ldm ...^) required by the 26-bit PC+PSR convention.
 */
ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Determine copy direction: C clear below means src < dst */
	cmp	r1, r0

	moveq	r0, #0			/* src == dst: nothing to do (NB: returns 0, not dst) */
#ifdef __APCS_26__
	moveqs	pc, lr			/* ^s form restores caller's PSR flags */
#else
	moveq	pc, lr
#endif

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	bcc	Lmemcpy_backwards

	/* start of forwards copy */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8
	blt	Lmemcpy_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* 4-7 bytes left: move one word ... */
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}		/* ... 8-11 left: move two words */
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1		/* second byte if r2 >= 2 */
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1		/* third byte if r2 == 3 */
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes to reach alignment */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_fsrcul:
	bic	r1, r1, #3		/* word-align src; lr holds the partial word */
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3		/* src was 3 bytes past alignment */
	beq	Lmemcpy_fsrcul2		/* src was 2 bytes past alignment */
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8		/* shift/merge words across the 1-byte offset */
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* undo alignment bias before byte tail */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16		/* shift/merge words across the 2-byte offset */
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* undo alignment bias before byte tail */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24		/* shift/merge words across the 3-byte offset */
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* undo alignment bias before byte tail */
	b	Lmemcpy_fl4

Lmemcpy_backwards:
	/* copy from the top down so overlapping src < dst is safe */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!		/* 4-7 bytes left: move one word ... */
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}		/* ... 8-11 left: move two words */
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_bsrcul:
	bic	r1, r1, #3		/* word-align src; r3 holds the partial word */
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1		/* src was 1 byte past alignment */
	beq	Lmemcpy_bsrcul2		/* src was 2 bytes past alignment */
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8		/* shift/merge words across the 3-byte offset */
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* undo alignment bias before byte tail */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16		/* shift/merge words across the 2-byte offset */
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* undo alignment bias before byte tail */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul1loop16:
	mov	lr, r3, lsl #24		/* shift/merge words across the 1-byte offset */
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* undo alignment bias before byte tail */
	b	Lmemcpy_bl4