/*	$NetBSD: memset_neon.S,v 1.1 2012/12/13 01:41:59 matt Exp $	*/

/*-
 * Copyright (c) 2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <machine/asm.h>
#include "assym.h"

/*
 * memset: Sets a block of memory to the specified value
 * Using NEON instructions
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
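/*
 * Overall flow: merge the fill value into a possibly unaligned head
 * dword, align the pointer to a 64-byte (cache line) boundary, store
 * the bulk with vstmia of d0-d7 (64 bytes per instruction, 512 bytes
 * per loop pass), then merge the fill value into a final partial dword.
 */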
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and		r3, r1, #0xff	/* We deal with bytes */
	mov		r1, r2
	mov		ip, r0		/* r0 needs to stay the same */

	vdup.8		q0, r3		/* move fill to SIMD */
	/* we no longer need to keep the fill value in an ARM register */
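	/*
	 * vdup.8 replicates the fill byte into all 16 lanes of q0 (d0/d1);
	 * q1-q3 get copies of it below so a single vstmia of d0-d7 can
	 * store 64 bytes of fill.
	 */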

	/* Ok first we will dword align the address */
	ands		r2, ip, #7	/* grab the bottom three bits */
	beq		.Lmemset_dwordaligned	/* The addr is dword aligned */

	bic		ip, ip, #7	/* clear bottom three bits of addr */
	vldr		d7, [ip]	/* load from memory */
	add		r1, r1, r2	/* add "pre-fill" to length */
	lsl		r2, r2, #3	/* byte to no-fill bit count */

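	/*
	 * Example (little-endian): for a destination with (addr & 7) == 3,
	 * r2 is now 24, so the shifted mask below is 0xffffffffff000000,
	 * selecting the five bytes to fill; its inverse keeps the three
	 * bytes that precede the destination.
	 */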
#ifdef __ARMEB__
	neg		r2, r2		/* start from the MSB */
#endif
	vmov		s4, r2		/* move to SIMD d2 */
	vmvn.u64	d3, #0		/* set all ones */
	vshl.u64	d3, d3, d2	/* d3 = mask of the bytes to fill */
	vmvn.u64	d2, d3		/* d2 = mask of the bytes to keep */
	vand		d7, d7, d2	/* keep the "pre-fill" bytes from memory */
	vand		d2, d0, d3	/* put the fill value in the fill bytes */
	vorr		d7, d2, d7	/* merge fill and memory */

	cmp		r1, #8		/* Do we have less than 8 bytes */
	movlt		r2, #0		/* indicate this is the last word */
	blt		.Lmemset_lessthaneight_noload

	vstmia		ip!, {d7}	/* write back to memory */
	subs		r1, r1, #8	/* and remove 8 bytes from the length */
	RETc(eq)

	/* We are now doubleword aligned */
.Lmemset_dwordaligned:
	vmov		q1, q0		/* put fill in q1 (d2-d3) */
	vmov		q2, q0		/* put fill in q2 (d4-d5) */
	vmov		q3, q0		/* put fill in q3 (d6-d7) */

	ands		r2, ip, #63	/* check for 64-byte alignment */
	beq		.Lmemset_8dwordaligned
	/*
	 * Let's align to a 64-byte boundary so that stores don't cross
	 * cacheline boundaries.  If fewer bytes than that remain, only
	 * store the complete dwords that fit.
	 */
	rsb		r2, r2, #64	/* how many bytes until 64 bytes */
	cmp		r1, r2		/* compare against length */
	andlt		r2, r1, #0x38 	/* if len is shorter, use trunc(len, 8) */
	subs		r1, r1, r2	/* subtract from len */
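	/*
	 * Computed branch: in ARM state pc reads as the address of the
	 * add plus 8, so r2 (0, 8, ..., 56) lands past the nop on the
	 * table entry that stores r2/8 dwords before continuing.
	 */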
	add		pc, pc, r2	/* and jump to it */
	nop
	RETc(eq);			b	.Lmemset_lessthaneight
	vstmia		ip!, {d0};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d1};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d2};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d3};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d4};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d5};	b	.Lmemset_8dwordaligned
	vstmia		ip!, {d0-d6}
.Lmemset_8dwordaligned:
	vmov		d0, d1		/* restore in case of unaligned start */
	cmp		r1, #8		/* do we have less than 8 bytes */
	movlt		r2, #0		/* indicate last word */
	blt		.Lmemset_lessthaneight

	cmp		r1, #512
	blt		.Lmemset_sub512

	/* Do 512 bytes at a time */
	mov		r2, #512
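	/* each vstmia below stores 64 bytes (d0-d7); eight of them per pass */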
.Lmemset_512:
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
	vstmia		ip!, {d0-d7}
.Lmemset_0:
	subs		r1, r1, r2
	RETc(eq)			/* return if done */
	cmp		r1, #512
	bge		.Lmemset_512

	/*
	 * We have less than 512 bytes left, but since the sequence above
	 * stores 64 bytes at a time, we determine the number of vstmia
	 * instructions needed to store the remainder (if >= 64 bytes) and
	 * branch back to execute exactly that many.
	 */
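	/*
	 * For example, with 200 bytes left r2 becomes 3, so the computed
	 * branch lands three vstmia instructions before .Lmemset_0 and
	 * 192 bytes are stored; the remaining 8 bytes are handled by the
	 * short-length code below.
	 */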
.Lmemset_sub512:
	lsrs		r2, r1, #6	/* divide by 64 */
	lslne		r4, r2, #2	/* multiply by 4 */
	addne		r4, r4, #1f + 8 - .Lmemset_0
					/* add the # of bytes between */
	lslne		r2, r2, #6	/* convert the block count back to bytes */
1:	subne		pc, r4 		/* and go */

	/*
	 * We have less than 64 bytes left to store at an 8-dword aligned
	 * address.
	 */
	and		r2, r1, #56	/* get # of full dwords */
	ands		r1, r1, #7	/* get # of extra bytes */
	beq		.Lmemset_finalstore
	/*
	 * The last dword is a partial fill, so load its value and update
	 * it to include the fill value.
	 */
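	/*
	 * Note the mask orientation is the opposite of the head case:
	 * here the shift count is the number of bits to fill, so the
	 * shifted mask selects the bytes to keep.
	 */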
.Lmemset_lessthaneight:
	vldr		d7, [ip, r2]	/* load the last partial dword */
.Lmemset_lessthaneight_noload:
	lsl		r1, r1, #3	/* byte to fill bit count */
#ifdef __ARMEB__
	neg		r1, r1		/* start from the MSB */
#endif
	vmov		s4, r1		/* move to SIMD d2 */
	vmvn.u64	d3, #0		/* set all ones */
	vshl.u64	d3, d3, d2	/* create a no-fill mask */
	vmvn.u64	d2, d3		/* invert mask */
	vand		d7, d7, d3	/* keep no-fill bits */
	vand		d2, d0, d2	/* mask out no-fill bits */
	vorr		d7, d2, d7	/* merge fill and no-fill */
	vmov		q1, q0		/* restore d2 & d3 */
	add		r2, r2, #8	/* compensate for the partial dword */
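	/*
	 * Jump into the store table below; each entry stores one more
	 * dword than the previous, always ending with d7 (which holds the
	 * merged partial dword when the length is not a multiple of 8) at
	 * the highest address.
	 */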
.Lmemset_finalstore:
	add		pc, pc, r2	/* and jump to it */
	nop
	vstr		d7, [ip];	RET
	vstmia		ip, {d6-d7};	RET
	vstmia		ip, {d5-d7};	RET
	vstmia		ip, {d4-d7};	RET
	vstmia		ip, {d3-d7};	RET
	vstmia		ip, {d2-d7};	RET
	vstmia		ip, {d1-d7};	RET
	vstmia		ip, {d0-d7};	RET
END(memset)