/* Origin: freebsd-src/contrib/cortex-strings/src/aarch64/memset.S
   (revision 8c4282b370bd66908b45b6a223226a9fc2b69d57) */
/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */


/* Register roles used by memset.  Each x/w pair names the 64-bit and
   32-bit views of the same register.  */

/* x0: destination pointer as passed in.  memset never writes this
   register, so it still holds the incoming pointer at every ret.  */
#define dstin	x0
/* x1/w1: fill value on entry; later overwritten with the fill byte
   replicated (short path) or masked to 8 bits (long path).  */
#define val	x1
#define valw	w1
/* x2: byte count on entry; reused as a biased loop counter.  */
#define count	x2
/* x3: working store pointer for the bulk loops.  */
#define dst	x3
/* x4: dstin + count, i.e. one past the last byte to set.  */
#define dstend	x4
/* x5, x6: scratch.  */
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
/* x7: DC ZVA block length in bytes (only used on the generic ZVA path).  */
#define zva_len	x7
#define zva_lenw	w7

/* L(name) expands to the assembler-local label .Lname.  */
#define L(l) .L ## l

77*09a53ad8SAndrew Turner	.macro def_fn f p2align=0
78*09a53ad8SAndrew Turner	.text
79*09a53ad8SAndrew Turner	.p2align \p2align
80*09a53ad8SAndrew Turner	.global \f
81*09a53ad8SAndrew Turner	.type \f, %function
82*09a53ad8SAndrew Turner\f:
83*09a53ad8SAndrew Turner	.endm
84*09a53ad8SAndrew Turner
/*
 * memset -- fill a memory region with a byte value.
 *
 * Presumably the C library entry point
 *     void *memset(void *s, int c, size_t n)
 * called under AAPCS64.
 *
 * In:       x0 (dstin) = start of the region
 *           w1 (valw)  = fill value; only the low byte is used
 *           x2 (count) = number of bytes to set
 * Out:      x0 is never written, so it still holds the incoming pointer.
 * Clobbers: x1-x7, v0, NZCV.
 * Stack:    none used.
 *
 * Strategy (every path relies on unaligned, overlapping stores):
 *    0..15 bytes  scalar stores anchored at both ends of the region.
 *   16..96 bytes  16-byte vector stores anchored at both ends.
 *   > 96 bytes    one unaligned 16-byte store, then a 16-byte-aligned loop
 *                 storing 64 bytes per iteration, then an unconditional
 *                 store of the last 64 bytes.
 *   >= 256 bytes and fill byte == 0
 *                 DC ZVA zeroes the bulk when DCZID_EL0 does not prohibit
 *                 it; head and tail are still written with stores.
 */
def_fn memset p2align=6

	dup	v0.16B, valw		/* v0 = fill byte in all 16 lanes.  */
	add	dstend, dstin, count	/* One past the last byte.  */

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]		/* val = fill byte in all 8 bytes.  */

	/* Set 0..15 bytes.  Bit 3 set means 8..15: two 8-byte stores, one
	   from each end, overlap to cover the whole range.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop				/* Padding; follows ret, never executed.  */
	/* 4..7 bytes: two overlapping 4-byte stores.  */
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
	/* 0..3 bytes: nothing for 0; first byte for 1; first byte plus the
	   last two bytes for 2..3.  */
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  Bit 6 set means 64..96 and is handled by set96.
	   Otherwise store 16 bytes at each end, plus a second 16 bytes at
	   each end when bit 5 is set (32..63).  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  The first 16 bytes were already stored
	   in set_medium.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop				/* Padding; follows ret, never executed.  */
	/* Set more than 96 bytes.  */
L(set_long):
	and	valw, valw, 255		/* Isolate the fill byte for the zero test.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	str	q0, [dstin]		/* First 16 bytes, possibly unaligned.  */
	/* Take the ZVA path only if count >= 256 and the fill byte is 0.
	   When count < 256 the ccmp forces NZCV = 0, so b.eq falls through.  */
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	add	dst, dst, 16
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
	/* Each iteration stores 64 bytes at dst.  The loop stops once at most
	   64 bytes remain; those are covered by the final 64-byte store
	   anchored at dstend.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	/* Entry from zva_other: count = bytes from dst to dstend.  */
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	/* Reached with count >= 256, fill byte 0, the first 16 bytes stored
	   and dst = dstin rounded down to 16.  */
L(try_zva):
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)	/* Bit 4 set: do not use DC ZVA.  */
	and	tmp1w, tmp1w, 15	/* Low 4 bits encode the block size.  */
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63		/* dst = 64-byte aligned base.  */
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop				/* Executed once; presumably aligns the loop head.  */
1:	dc	zva, dst		/* Zero one 64-byte block.  */
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	/* One more aligned 64-byte block, then the last 64 bytes anchored
	   at dstend.  */
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	/* Store up to the first 128-byte boundary past the head, zero whole
	   128-byte blocks with DC ZVA, then store the last 128 bytes
	   anchored at dstend.  */
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127		/* dst = 128-byte aligned base.  */
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst		/* Zero one 128-byte block.  */
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	/* Any other ZVA block size.  tmp1w still holds the 4-bit size field;
	   the block length in bytes is 4 << field.  */
L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
	cmp	count, tmp1
	blo	L(no_zva)		/* Too short to reach an aligned block.  */

	sub	tmp2, zva_len, 1	/* Alignment mask.  */
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f			/* zva_len == 16: no head stores needed.  */
	/* Store 64 bytes per iteration until the head up to the aligned
	   start address is covered.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f			/* Less than one whole block remains.  */
3:	dc	zva, dst		/* Zero one zva_len-byte block.  */
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
	/* Undo the bias so count = bytes from dst to dstend, then finish
	   with the shared 64-byte store loop and tail.  */
4:	add	count, count, zva_len
	b	L(tail64)

	.size	memset, . - memset