/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu <ryo@nerv.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION		memcpy
#define NO_OVERLAP
#define SRC0			x1
#define DST0			x0
#define LEN			x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION		memmove
#undef NO_OVERLAP
#define SRC0			x1
#define DST0			x0
#define LEN			x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION		bcopy
#define NO_OVERLAP
#define SRC0			x0
#define DST0			x1
#define LEN			x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers; these may be clobbered freely. */
#define TMP_X			x3
#define TMP_Xw			w3
#define TMP_D			x4
#define TMP_S			x5
#define DST			x6
#define SRC			x7
#define DATA0			x8
#define DATA0w			w8
#define DATA1			x9
#define DATA1w			w9
#define DATA2			x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */
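/*
 * The *_ALIGNBIT values are kept in bits (the byte offset within a
 * doubleword multiplied by 8) so that they can be used directly as shift
 * amounts when two source doublewords are merged in the shifting-copy
 * paths below.
 */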

#define STP_ALIGN		16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE		32
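
/*
 * Overview:
 * - Copies shorter than SMALLSIZE bytes are handled with small unrolled
 *   load/store sequences.
 * - Larger copies align DST, move 1KB blocks with unrolled ldp/stp pairs,
 *   and branch into the middle of the unrolled block for the remainder.
 * - With STRICT_ALIGNMENT, when src and dst have different alignment the
 *   data is loaded doubleword-aligned and realigned with shifts before
 *   being stored (the "shifting copy" paths).
 * - When overlap is possible (memmove), the copy runs backward whenever
 *   the source lies below the destination.
 */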

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST is not necessarily aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	backward_copy1k
backward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)		/* tmp_d = (len & 1023) & ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - tmp_d/2) */
	br	TMP_X
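	/*
	 * The br above lands part way into the unrolled ldp/stp block below:
	 * each 16-byte copy is two instructions (8 bytes of code), so
	 * branching to (8f - tmp_d/2) executes exactly tmp_d/16 copies.
	 */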
backward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	backward_copy1k
	cmp	LEN, #16
	bhs	backward_less1k

	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*8) */
	br	TMP_X
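	/*
	 * Each 1-byte copy below is two instructions (8 bytes of code), so
	 * branching to (8f - len*8) executes exactly len ldrb/strb pairs.
	 */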
	.rept	10
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	.endr
8:
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src and dst have different alignment */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:					/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/*  *--(char *)dst = *--(char *)src */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
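	/*
	 * src and dst have different alignment.  SRC/DST are rounded down to
	 * doubleword boundaries, and each destination doubleword is merged
	 * from two neighbouring source doublewords using shifts by
	 * SRC_DST_ALIGNBIT/DST_SRC_ALIGNBIT bits (variable shifts are taken
	 * modulo 64).  For example, (src & 7) == 3 and (dst & 7) == 5 gives
	 * SRC_DST_ALIGNBIT = (3 - 5) * 8 = -16.
	 */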

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* }                          */
5:						/* else {                     */
	ldr	DATA0, [SRC]			/*  data0 = *src;             */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/*  data1=data0>>src_dst_abit;*/
9:						/* }                          */

	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
	mov	TMP_D, DST		/*   tmp_d = dst;                     */

	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
	str	DATA1w, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32	/*      data1 >>= 32;                 */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
	strh	DATA1w, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16	/*      data1 >>= 16;                 */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
	strb	DATA1w, [TMP_D]		/*      *(uint8_t *)tmp_d = data1;    */
1:					/*    }                               */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
9:					/* }                                  */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* }                          */
5:						/* else {                     */
	ldr	DATA0, [SRC]			/*  data0 = *src;             */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/*  data1=data0<<dst_src_abit;*/
9:						/* }                          */

	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
	mov	TMP_D, DST		/*   tmp_d = dst;                     */

	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
	lsr	TMP_X, DATA1, #32	/*      x = data1 >> 32;              */
	str	TMP_Xw, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = x;     */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
	lsr	TMP_X, DATA1, #16	/*      x = data1 >> 16;              */
	strh	TMP_Xw, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = x;     */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
	lsr	TMP_X, DATA1, #8	/*      x = data1 >> 8;               */
	strb	TMP_Xw, [TMP_D], #1	/*      *(uint8_t *)tmp_d++ = x;      */
1:					/*    }                               */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
9:					/* }                                  */
#endif /* BYTE_ORDER */


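	/*
	 * Main backward shifting-copy loop: DATA0 holds the source doubleword
	 * loaded on the previous iteration; each pass loads 16 more source
	 * bytes and stores 16 destination-aligned bytes merged from them.
	 */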
backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
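	/*
	 * DATA0 still holds unstored source bytes; load one more doubleword
	 * only when the remaining length does not fit in them, so that we
	 * never read before the start of the source buffer.
	 */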
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*8) */
	br	TMP_X
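	/*
	 * Each 1-byte copy below is two instructions (8 bytes of code), so
	 * branching to (8f - len*8) executes exactly len ldrb/strb pairs.
	 */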
	.rept	10
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	.endr
8:
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* length is small, and src and dst have different alignment */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:					/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/*  *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT
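	/*
	 * src and dst have different alignment.  As in the backward case,
	 * SRC/DST are rounded down to doubleword boundaries and each
	 * destination doubleword is merged from two source doublewords with
	 * shifts by SRC_DST_ALIGNBIT/DST_SRC_ALIGNBIT bits; the first,
	 * partial destination doubleword is written out below byte/half/word
	 * at a time.
	 */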

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

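	/*
	 * Main forward shifting-copy loop: DATA0 holds the source doubleword
	 * loaded on the previous iteration; each pass loads 16 more source
	 * bytes and stores 16 destination-aligned bytes merged from them.
	 */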
shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
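	/*
	 * DATA0 still holds unstored source bytes; load one more doubleword
	 * only when the remaining length does not fit in them, so that we
	 * never read past the end of the source buffer.
	 */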
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST is not necessarily aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	forward_copy1k
forward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)		/* tmp_d = (len & 1023) & ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - tmp_d/2) */
	br	TMP_X
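	/*
	 * As in the backward case, the br above lands part way into the
	 * unrolled ldp/stp block below: each 16-byte copy is two
	 * instructions (8 bytes of code), so branching to (8f - tmp_d/2)
	 * executes exactly tmp_d/16 copies.
	 */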
forward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	forward_copy1k
	cmp	LEN, #16
	bhs	forward_less1k

	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)
