/*	$OpenBSD: memcpy.S,v 1.4 2013/06/15 19:16:53 miod Exp $	*/
/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code, e.g. Iron Maiden.
 *
 * For anyone attempting to understand it:
 *
 * The core copy code is implemented here, with simple entry points for
 * memcpy() and memmove().
 *
 * All local labels are prefixed with Lmemcpy_.
 * After the prefix, labels starting with 'f' are used in the forward copy
 * code and labels starting with 'b' in the backwards copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations,
 * for both the forward and backwards copies:
 * an unaligned source address
 * an unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code uses LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) may be trashed during the function, along with
 * r0-r3, although r0-r2 have defined uses throughout, i.e. dest, src and
 * len.  Additional registers (r4, r5 and lr) are preserved on the stack
 * before use.
 *
 * Apologies for the state of the comments ;-)
 */
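/*
 * Rough C-level sketch of the flow below (illustrative only, not part of
 * the build; per the APCS, r0 = dest, r1 = src, r2 = len on entry):
 *
 *	if (src == dest)
 *		return (dest);
 *	if (src < dest)
 *		copy backwards, starting from src + len and dest + len
 *		(this is what keeps overlapping memmove() calls safe);
 *	else
 *		copy forwards;
 *	in either direction:
 *		align dest with byte copies, then move 32/16/12/4 byte
 *		blocks with LDM/STM (merging shifted words if src is
 *		still unaligned), and finish the last 0-3 bytes one at
 *		a time;
 *	return (dest);
 */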

ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Determine copy direction */
	cmp	r1, r0

#ifdef __APCS_26__
	moveqs	pc, lr
#else
	moveq	pc, lr
#endif
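	/*
	 * On 26-bit APCS targets the PSR flags live in r15 alongside the
	 * PC, so returns use the flag-restoring "movs pc" / "ldm ...^"
	 * forms; 32-bit targets use the plain forms instead.
	 */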

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	bcc	Lmemcpy_backwards

	/* start of forwards copy */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8
	blt	Lmemcpy_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4
Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
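	/*
	 * The source is rounded down to a word boundary and whole words are
	 * loaded; each output word is then built by shifting and ORing two
	 * adjacent source words by 8, 16 or 24 bits according to the
	 * misalignment (little-endian byte order).  Before falling into the
	 * byte-copy tail, r1 is stepped back to the first source byte that
	 * has not yet been consumed.
	 */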
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1
	b	Lmemcpy_fl4

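	/*
	 * Backwards copy: both pointers are advanced to the ends of their
	 * buffers and the copy walks downwards.  This is what keeps
	 * memmove() safe when the destination overlaps the end of the
	 * source.
	 */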
Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
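	/*
	 * Mirror image of the forward unaligned-source code: whole words
	 * are loaded at descending aligned addresses and merged with
	 * left/right shifts chosen by the misalignment before being
	 * stored.
	 */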
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul1loop32:			/* despite the name, 16 bytes per pass */
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1
	b	Lmemcpy_bl4