xref: /minix3/common/lib/libc/arch/arm/string/memcpy_neon.S (revision 84d9c625bfea59e274550651111ae9edfdc40fbd)
1/*-
2 * Copyright (c) 2013 The NetBSD Foundation, Inc.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to The NetBSD Foundation
6 * by Matt Thomas of 3am Software Foundry.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include <machine/asm.h>
31
32RCSID("$NetBSD: memcpy_neon.S,v 1.1 2013/01/03 09:34:44 matt Exp $")
33
34	.text
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * NEON based memcpy.  The code is written for the ARM (A32) instruction
 * encoding: it uses bare conditional instructions and a computed jump
 * (in .Lincongruent_dword) that counts on every instruction being 4 bytes.
 *
 * In:    r0 = dst, r1 = src, r2 = len
 * Out:   r0 = dst (r0 is never modified)
 * Uses:  r3      dst write cursor
 *        r2      bytes still to be written (treated as unsigned)
 *        r5      incongruent path: number of not yet consumed source
 *                bytes held in d1 (8 - (src & 7), i.e. 1..7)
 *        r4, r5  scratch in .Lfinish; both are saved on the stack
 *        ip, flags, d0-d7, d16-d23
 *
 * Only caller-saved NEON registers (d0-d7, d16-d31) are used; d8-d15 are
 * callee-saved under the AAPCS and are deliberately left alone.
 *
 * Strategy:
 *   1. Copy bytes until dst is 8-byte aligned.
 *   2. If src is then 8-byte aligned too ("congruent"), copy dwords, and
 *      once dst reaches a 64-byte boundary copy 64/128 bytes at a time.
 *   3. Otherwise ("incongruent") read aligned dwords from src and use
 *      vtbl to shift the wanted bytes of two neighbouring dwords into one
 *      aligned dword for dst.
 *   4. A trailing partial dword (1..7 bytes) is written by .Lfinish.
 */
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/*   if not, does src == dst? */
	RETc(eq)			/*   yes (to either), nothing to copy */

	mov	r3, r0			/* work on a copy, keep r0 unchanged */

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword (8-byte) aligned? */
	beq	.Lsrc_aligner		/*   yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy byte by byte until
	 * it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/*   no, try next byte */
	RET				/* yes, we're done! */

	/*
	 * dst is dword aligned and r2 != 0 from here on.
	 */
.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/*   aligned, do it the fast way */

	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* r5 = useful bytes in 1st src dword */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value (byte indices 0..7) */
	vadd.u8	d0, d0, d1		/* d0 = vtbl selector: index + offset */

	vld1.64	{d1}, [r1:64]!		/* load a dword from src */

	cmp	r2, r5			/* do we already have enough? */
	bhi	.Lincongruent		/*   no, so read more */

	/*
	 * Everything still wanted is in d1 (and possibly d2).  Shift it into
	 * place and write it out.  All branches to this label arrive with
	 * r2 < 8, so the partial store in .Lfinish is the normal outcome.
	 */
.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32	{d0}, [r3:64]		/*   yes, write final full dword */
	b	.Ldone			/* and we're done! */

	/*
	 * Dword at a time until dst reaches a 64-byte boundary.
	 * Invariant at the label: d1 holds r5 unconsumed bytes and r2 > r5,
	 * so the next source dword really is needed.
	 */
.Lincongruent:
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	cmp	r2, r5			/* is the rest already in d1? */
	bls	.Lincongruent_finish	/*   yes, don't read past end of src */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent		/*   no, load next dword */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */
	vld1.64	{d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 more dwords? */
	blo	.Lincongruent_4dword	/*   no, handle it */

	/*
	 * 64 bytes per iteration.  d1-d5 and d16-d20 form two chains of
	 * consecutive registers because vtbl needs a consecutive pair.
	 */
1:	vld1.64	{d17-d20}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d16, d5			/* move out of the way of the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/*   no more data, skip the load */
	vld1.64	{d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d16, {d16-d17}, d0	/* reorder */
	vtbl.8	d17, {d17-d18}, d0	/* reorder */
	vtbl.8	d18, {d18-d19}, d0	/* reorder */
	vtbl.8	d19, {d19-d20}, d0	/* reorder */
	vst1.64	{d16-d19}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we just wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */
	vmov	d1, d20			/* keep the leftovers in d1 */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bhs	1b			/*   yes, do it again */

	/*
	 * We have leftovers in d1 and, if at least 32 bytes remain, new
	 * untranslated data in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32			/* were d2-d5 loaded? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32		/* we just wrote 4 dwords */
	beq	.Ldone			/*   if 0, we're done! */

	/*
	 * Fewer than 32 bytes remain.  Jump into an unrolled sequence of
	 * four load/reorder/store/move groups so that exactly r2 / 8 of them
	 * execute.  Each group is 4 instructions (16 bytes); pc reads as the
	 * address of the "add" plus 8, i.e. the first group, and the offset
	 * added is 2 * (32 - (r2 & ~7)).  Do not change the size of the
	 * groups or insert anything between the "add" and the first group.
	 */
.Lincongruent_dword:
	cmp	r2, r5			/* are the bytes we have enough? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8 (flags used below) */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* nothing left (Z from the ands) */
	cmp	r2, r5			/* is the rest already in d1? */
	bls	.Lincongruent_finish	/*   yes, don't read past end of src */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */

	/*
	 * src and dst are both dword aligned.  Copy dword by dword until dst
	 * hits a 64-byte boundary (at least one dword is always processed).
	 */
.Lcongruent_main:
	vld1.32	{d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32	{d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/*   no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/*   no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blo	3f			/*   no, then deal with 8 dwords */

	/*
	 * The following writes two 64-byte blocks, interleaving stores and
	 * loads.  d16-d23 are used for the second block because d8-d15
	 * would have to be preserved for the caller.
	 */
1:	vldm	r1!, {d16-d23}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blo	2f			/*   no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/*   yes, load next 8 dwords */
2:	vstm	r3!, {d16-d23}		/* store 8 more dwords */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/*   if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bhs	1b			/*   yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blo	.Lcongruent_loop	/*   no, proceed to do it dword by dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */

	/*
	 * Fewer than 64 bytes (but at least one) remain.
	 */
.Lcongruent_loop:
	vld1.32	{d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32	{d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	vld1.32	{d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bhs	.Lcongruent_loop_start	/*   yes, so do it */

	/*
	 * Write the first r2 (1..7) bytes of d0: a word, a halfword and a
	 * byte as selected by bits 2, 1 and 0 of r2.
	 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/*   yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/*   yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/*   yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/*   yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/*   yes, store it */

.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

	/*
	 * Identity byte-index vector for vtbl; the src misalignment is
	 * added to every lane at run time.
	 */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)
278