xref: /minix3/common/lib/libc/arch/arm/string/memcpy_neon.S (revision 84d9c625bfea59e274550651111ae9edfdc40fbd)
1*84d9c625SLionel Sambuc/*-
2*84d9c625SLionel Sambuc * Copyright (c) 2013 The NetBSD Foundation, Inc.
3*84d9c625SLionel Sambuc * All rights reserved.
4*84d9c625SLionel Sambuc *
5*84d9c625SLionel Sambuc * This code is derived from software contributed to The NetBSD Foundation
6*84d9c625SLionel Sambuc * by Matt Thomas of 3am Software Foundry.
7*84d9c625SLionel Sambuc *
8*84d9c625SLionel Sambuc * Redistribution and use in source and binary forms, with or without
9*84d9c625SLionel Sambuc * modification, are permitted provided that the following conditions
10*84d9c625SLionel Sambuc * are met:
11*84d9c625SLionel Sambuc * 1. Redistributions of source code must retain the above copyright
12*84d9c625SLionel Sambuc *    notice, this list of conditions and the following disclaimer.
13*84d9c625SLionel Sambuc * 2. Redistributions in binary form must reproduce the above copyright
14*84d9c625SLionel Sambuc *    notice, this list of conditions and the following disclaimer in the
15*84d9c625SLionel Sambuc *    documentation and/or other materials provided with the distribution.
16*84d9c625SLionel Sambuc *
17*84d9c625SLionel Sambuc * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18*84d9c625SLionel Sambuc * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19*84d9c625SLionel Sambuc * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20*84d9c625SLionel Sambuc * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21*84d9c625SLionel Sambuc * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22*84d9c625SLionel Sambuc * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23*84d9c625SLionel Sambuc * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24*84d9c625SLionel Sambuc * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25*84d9c625SLionel Sambuc * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26*84d9c625SLionel Sambuc * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27*84d9c625SLionel Sambuc * POSSIBILITY OF SUCH DAMAGE.
28*84d9c625SLionel Sambuc */
29*84d9c625SLionel Sambuc
30*84d9c625SLionel Sambuc#include <machine/asm.h>
31*84d9c625SLionel Sambuc
32*84d9c625SLionel SambucRCSID("$NetBSD: memcpy_neon.S,v 1.1 2013/01/03 09:34:44 matt Exp $")
33*84d9c625SLionel Sambuc
34*84d9c625SLionel Sambuc	.text
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * NEON based copy.  This code must be assembled for the ARM (A32)
 * instruction set: the computed branch in .Lincongruent_dword relies on
 * every instruction being 4 bytes long and on pc reading as the address
 * of the current instruction plus 8.
 *
 * In:	r0 = dst, r1 = src, r2 = len
 * Out:	r0 = dst (r0 is never modified)
 *
 * Register roles:
 *	r3	running dst pointer (dword aligned after .Ldst_aligner)
 *	r1	running src pointer (dword aligned after .Lsrc_aligner)
 *	r2	number of bytes that still have to be stored (unsigned)
 *	r5	incongruent path: number of src bytes held in d1 that have
 *		not been stored yet (1..7); scratch in .Lfinish
 *	r4	scratch in .Lfinish
 *	ip	scratch
 *	d0	incongruent path: vtbl index vector; otherwise data
 *	d1	incongruent path: last src dword loaded (holds the leftovers)
 *	d2-d7, d16-d23	data
 *
 * r4/r5 are saved on the stack.  Only d0-d7 and d16-d23 are used as
 * NEON data registers; d8-d15 are callee-saved under the AAPCS and are
 * deliberately left alone.
 */
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/*   if not, does src == dst? */
	RETc(eq)			/*   yes (to either), nothing to do */

	mov	r3, r0			/* keep r0 unchanged (return value) */

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword aligned? */
	beq	.Lsrc_aligner		/*   yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy byte by byte until
	 * it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/*   no, try next byte */
	RET				/* yes, we're done! */

	/*
	 * From here on dst (r3) is dword aligned and r2 > 0.
	 */
.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/*   aligned, do it the fast way */

	/*
	 * src and dst are not congruent modulo 8.  Read aligned dwords
	 * from src and use vtbl to extract each dst dword from two
	 * adjacent src dwords.
	 */
	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* wanted bytes in each src dword */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value */
	vadd.u8	d0, d0, d1		/* add offset to it */

	vld1.64 {d1}, [r1:64]!		/* load a dword from src */

	cmp	r2, r5			/* do we already have enough? */
	bhi	.Lincongruent		/*   no, so read more */

	/*
	 * Fewer than 8 bytes are normally left here.  Only the first r2
	 * bytes of the merged dword are stored, so d2 may be stale when
	 * all of them come from d1.
	 */
.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]		/*   yes, write final full dword */
	b	.Ldone			/* and we're done! */

	/*
	 * Dword by dword until dst reaches a 64-byte boundary.
	 * On entry r2 > r5, so the next src dword is really needed.
	 */
.Lincongruent:
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	cmp	r2, r5			/* are the leftovers enough? */
	bls	.Lincongruent_finish	/*   yes, don't read past src */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent		/*   no, load next dword */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */
	vld1.64 {d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 more dwords? */
	blo	.Lincongruent_4dword	/*   no, handle it */

	/*
	 * 64 bytes per iteration.  d1 holds the leftovers, d2-d5 the next
	 * 4 untranslated dwords.  The second half uses d16-d20.
	 */
1:	vld1.64 {d17-d20}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d16, d5			/* move out of the way of the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/*   no more data, skip the load */
	vld1.64 {d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d16, {d16-d17}, d0	/* reorder */
	vtbl.8	d17, {d17-d18}, d0	/* reorder */
	vtbl.8	d18, {d18-d19}, d0	/* reorder */
	vtbl.8	d19, {d19-d20}, d0	/* reorder */
	vst1.64 {d16-d19}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we just stored 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */
	vmov	d1, d20			/* move leftovers */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bhs	1b			/*   yes, do it again */

	/*
	 * We have leftovers in d1 and, if at least 32 bytes remain, new
	 * untranslated data in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32			/* can we write 4 dwords? */
	blo	.Lincongruent_dword	/*   no, d2-d5 were not loaded */

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32		/* we just stored 4 dwords */
	beq	.Ldone			/*   if 0, we're done! */

	/*
	 * Fewer than 32 bytes remain, i.e. 0 to 3 full dwords followed by
	 * 0 to 7 bytes.  Branch into the unrolled sequence below so that
	 * exactly (r2 / 8) load/reorder/store groups are executed.  Each
	 * group is 4 instructions (16 bytes); pc reads as the address of
	 * the first group, so the displacement is (32 - (r2 & ~7)) * 2.
	 */
.Lincongruent_dword:
	cmp	r2, r5			/* are the bytes we have enough? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8 (Z used after groups) */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop				/* pad: pc reads 2 instructions ahead */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* no partial dword left, done */
	cmp	r2, r5			/* are the leftovers enough? */
	bls	.Lincongruent_finish	/*   yes, don't read past src */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */

	/*
	 * src and dst are both dword aligned.  Copy dword by dword until
	 * dst reaches a 64-byte boundary.
	 */
.Lcongruent_main:
	vld1.32 {d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/*   no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/*   no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blo	3f			/*   no, then deal with 8 dwords */

	/*
	 * The following writes two 64-byte blocks per iteration,
	 * interleaving stores and loads.
	 */
1:	vldm	r1!, {d16-d23}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blo	2f			/*   no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/*   yes, load next 8 dwords */
2:	vstm	r3!, {d16-d23}		/* store 8 more dwords */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/*   if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bhs	1b			/*   yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blo	.Lcongruent_loop	/*   no, proceed to do it by dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */

	/*
	 * Fewer than 64 bytes (but at least 1) remain.
	 */
.Lcongruent_loop:
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32 {d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bhs	.Lcongruent_loop_start	/*   yes, so do it */

	/*
	 * d0 holds the final dword; store its first (r2 & 7) bytes as a
	 * word, a halfword and a byte as needed.
	 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/*   yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/*   yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/*   yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/*   yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/*   yes, store it */

.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

	/*
	 * Identity byte-index vector for vtbl; the src misalignment is
	 * added to every lane at run time.
	 */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)
278