/* $NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION		memcpy
#define NO_OVERLAP
#define SRC0			x1
#define DST0			x0
#define LEN			x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION		memmove
#undef NO_OVERLAP
#define SRC0			x1
#define DST0			x0
#define LEN			x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION		bcopy
#define NO_OVERLAP
#define SRC0			x0
#define DST0			x1
#define LEN			x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */
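
/*
 * This one source builds memcpy, memmove and bcopy.  The three entry
 * points differ only in which argument registers hold src/dst (bcopy
 * takes src first) and in whether overlapping regions must be handled:
 * as configured above, NO_OVERLAP is defined for memcpy and bcopy, so
 * only memmove uses the backward-copy paths.
 */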

/* caller-saved temporary registers. breakable. */
#define TMP_X			x3
#define TMP_Xw			w3
#define TMP_D			x4
#define TMP_S			x5
#define DST			x6
#define SRC			x7
#define DATA0			x8
#define DATA0w			w8
#define DATA1			x9
#define DATA1w			w9
#define DATA2			x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN		16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE		32
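/*
 * Copies shorter than SMALLSIZE bytes take the unrolled small-copy code;
 * anything longer goes through copy_forward/copy_backward (or the
 * shifting-copy paths when STRICT_ALIGNMENT is defined).
 */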

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
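/*
 * Backward (high address to low address) copy for overlapping regions,
 * used when unaligned access is permitted.  Reached from the entry point
 * when src < dst; copying from the tail downward guarantees that the
 * overlapping part of the source is read before it is overwritten.
 */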
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

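/*
 * Bulk backward copy.  When unaligned access is allowed the pre-alignment
 * below is skipped for copies shorter than 512 bytes; otherwise DST is
 * first brought down to an STP_ALIGN boundary, after which the data is
 * moved in 1024-byte unrolled ldp/stp blocks and then one pass per
 * remaining power-of-two chunk.
 */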
	.align	4
copy_backward:
	/* DST is not aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:					/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/*  *--(char *)dst = *--(char *)src */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

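/*
 * Backward copy where src and dst disagree in alignment modulo 8 (the
 * same-alignment case branches back to copy_backward).  Both pointers
 * are rounded down to 8-byte boundaries and the difference is kept as a
 * bit count:
 *	src_dst_alignbit = ((src & 7) - (dst & 7)) * 8
 * Each aligned 64-bit load is merged with the previously loaded word by
 * shifting, so memory is only ever accessed at aligned addresses.
 */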
strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* }                          */
5:						/* else {                     */
	ldr	DATA0, [SRC]			/*  data0 = *src;             */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/*  data1=data0>>src_dst_abit;*/
9:						/* }                          */

	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
	mov	TMP_D, DST		/*   tmp_d = dst;                     */

	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
	str	DATA1w, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32	/*      data1 >>= 32;                 */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
	strh	DATA1w, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16	/*      data1 >>= 16;                 */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
	strb	DATA1w, [TMP_D]		/*      *(uint8_t *)tmp_d = data1;    */
1:					/*    }                               */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
9:					/* }                                  */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* }                          */
5:						/* else {                     */
	ldr	DATA0, [SRC]			/*  data0 = *src;             */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/*  data1=data0<<dst_src_abit;*/
9:						/* }                          */

	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
	mov	TMP_D, DST		/*   tmp_d = dst;                     */

	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
	lsr	TMP_X, DATA1, #32	/*      x = data1 >> 32;              */
	str	TMP_Xw, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = x;     */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
	lsr	TMP_X, DATA1, #16	/*      x = data1 >> 16;              */
	strh	TMP_Xw, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = x;     */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
	lsr	TMP_X, DATA1, #8	/*      x = data1 >> 8;               */
	strb	TMP_Xw, [TMP_D], #1	/*      *(uint8_t *)tmp_d++ = x;      */
1:					/*    }                               */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
9:					/* }                                  */
#endif /* BYTE_ORDER */

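/*
 * Main loop of the backward shifting copy: each iteration loads two
 * aligned 64-bit words below SRC, merges them with the word kept in
 * DATA0 from the previous iteration by shifting, and stores an aligned
 * 16-byte pair below DST; DATA2 carries the leftover word into the next
 * iteration.
 */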
backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */

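/*
 * Entry point for memcpy/memmove/bcopy (see FUNCTION above).  For the
 * overlap-capable build, zero length and src == dst return immediately
 * and src < dst is copied backward; everything else is copied forward,
 * with lengths below SMALLSIZE handled by the short unrolled code and
 * larger copies by the block-copy (or, under STRICT_ALIGNMENT, the
 * shifting-copy) paths.
 */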
	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:					/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/*  *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

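/*
 * Forward version of the shifting copy: src and dst differ in alignment
 * modulo 8 and unaligned access is not allowed.  When dst is not 8-byte
 * aligned, the first partial destination word is stored in byte/halfword/
 * word pieces; after that the loop below works on aligned 64-bit words
 * only.
 */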
strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

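/*
 * Main loop of the forward shifting copy: each iteration loads two
 * aligned 64-bit words, merges them with the word left over in DATA0
 * from the previous iteration by shifting, and stores an aligned
 * 16-byte pair to DST.
 */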
shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, 16]
	prfm	PLDL1KEEP, [SRC0, 24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

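/*
 * Bulk forward copy.  When unaligned access is allowed the pre-alignment
 * below is skipped for copies shorter than 512 bytes; otherwise DST is
 * first advanced to an STP_ALIGN boundary, after which the data is moved
 * in 1024-byte unrolled ldp/stp blocks and then one pass per remaining
 * power-of-two chunk down to the final bytes.
 */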
	.align	4
copy_forward:
	/* DST is not aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)