/*	$NetBSD: bcopyinout.S,v 1.23 2022/10/20 06:58:38 skrll Exp $	*/

/*
 * Copyright (c) 2002 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Allen Briggs for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_multiprocessor.h"
#include "opt_cpuoptions.h"

#include "assym.h"

#include <machine/asm.h>

#include <arm/locore.h>

#if defined(__XSCALE__) || defined(_ARM_ARCH_6)
/*
 * ARMv6 and ARMv7 have pld and strd, so they can use the XScale
 * bcopyinout as well.
 */
#include "bcopyinout_xscale.S"
#else

RCSID("$NetBSD: bcopyinout.S,v 1.23 2022/10/20 06:58:38 skrll Exp $")

	.text
	.align	0

#define SAVE_REGS	stmfd	sp!, {r4-r11}
#define RESTORE_REGS	ldmfd	sp!, {r4-r11}

#if defined(__XSCALE__) || defined(_ARM_ARCH_6)
#define HELLOCPP #
#define PREFETCH(rx,o)	pld	[ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif
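
/*
 * The indirection through HELLOCPP is only there to get a literal '#'
 * into the expansion: written directly in the PREFETCH body, cpp would
 * treat the '#' as its stringification operator.  On CPUs with pld,
 * PREFETCH(r0, 32) should expand to roughly
 *
 *	pld	[ r0 , # (32) ]
 *
 * and to nothing at all on everything else.
 */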

/*
 * r0 = user space address
 * r1 = kernel space address
 * r2 = length
 *
 * Copies bytes from user space to kernel space
 *
 * We save/restore r4-r11:
 * r4-r11 are scratch
 */
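/*
 * In C terms this implements roughly (a sketch; see copyin(9)):
 *
 *	int copyin(const void *uaddr, void *kaddr, size_t len);
 *
 * returning 0 on success or EFAULT if the user address faults.  The
 * user side of the copy is done with ldrt, the unprivileged load, so
 * a bad or unmapped user address traps into the PCB_ONFAULT handler
 * instead of being read with kernel privileges.
 */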
ENTRY(copyin)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

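	/*
	 * Install .Lcopyfault as the fault-recovery address in the
	 * current lwp's pcb, keeping the previous value in r5 so it
	 * can be restored on the way out.
	 */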
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are fewer than 8 bytes, take the slow (byte-by-byte) path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * Align destination to word boundary.
	 */
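	/*
	 * The "ldr pc, [pc, r6, lsl #2]" below is a computed goto: in
	 * ARM state, pc reads as the address of the current instruction
	 * plus 8, which is exactly where the .word table starts, so the
	 * load fetches entry r6 of the table into pc.  In C this is
	 * roughly (a sketch, not generated code):
	 *
	 *	switch (dst & 3) {
	 *	case 1: copy 3 bytes; break;	// enter at .Lial3
	 *	case 2: copy 2 bytes; break;	// enter at .Lial2
	 *	case 3: copy 1 byte;  break;	// enter at .Lial1
	 *	default: break;			// already word aligned
	 *	}
	 *
	 * In the assembly the cases are realized by falling through the
	 * .Lial* labels; the "b .Lialend" is never executed.
	 */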
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lialend
	.word	.Lialend
	.word	.Lial3
	.word	.Lial2
	.word	.Lial1
.Lial3:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lial2:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lial1:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lialend:

	/*
	 * If fewer than 8 bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Licleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Licleanup8

	/*
	 * Align destination to cacheline boundary.
	 * If source and destination are nicely aligned, this can be a big
	 * win.  If not, it's still cheaper to copy in groups of 32 even if
	 * we don't get the nice cacheline alignment.
	 */
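	/*
	 * Note that r6 is used as a plain byte offset here (no "lsl #2"):
	 * the destination is already word aligned, so the low two bits of
	 * r6 are zero and it indexes the .word table directly, one entry
	 * per 4 bytes of cacheline misalignment.
	 */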
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Licaligned
	.word	.Licaligned
	.word	.Lical28
	.word	.Lical24
	.word	.Lical20
	.word	.Lical16
	.word	.Lical12
	.word	.Lical8
	.word	.Lical4
.Lical28:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical24:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical20:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical16:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical12:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical8:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical4:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 * 	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Licaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x40
	bge	.Licaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x08
	blt	.Liprecleanup

.Licleanup8:
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	sub	r2, r2, #8
	stmia	r1!, {r8, r9}
	cmp	r2, #8
	bge	.Licleanup8

.Liprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Liout

.Licleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Licend
	.word	.Lic4
	.word	.Lic1
	.word	.Lic2
	.word	.Lic3
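	/*
	 * Byte-at-a-time tail copy.  The table is indexed by (r2 & 3):
	 * an index of 1..3 enters at the label that copies exactly that
	 * many bytes, while an index of 0 (r2 a non-zero multiple of 4)
	 * enters at .Lic4 and copies 4; the "bne .Licleanup" at the
	 * bottom loops until r2 reaches zero.
	 */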
.Lic4:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic3:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lic2:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic1:	ldrbt	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Licend:
	bne	.Licleanup

.Liout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET

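/*
 * Fault exit shared by copyin, copyout and kcopy.  When one of the
 * copies touches a bad address, the data abort handler notices that
 * pcb_onfault is set, abandons the faulting instruction and resumes
 * execution here with the error code (typically EFAULT) already in
 * r0, so all that is left to do is restore the previous onfault
 * handler and the callee-saved registers.
 */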
.Lcopyfault:
	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(copyin)

/*
 * r0 = kernel space address
 * r1 = user space address
 * r2 = length
 *
 * Copies bytes from kernel space to user space
 *
 * We save/restore r4-r11:
 * r4-r11 are scratch
 */
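/*
 * In C terms this is roughly (a sketch; see copyout(9)):
 *
 *	int copyout(const void *kaddr, void *uaddr, size_t len);
 *
 * returning 0 on success or EFAULT on a bad user address.  Here the
 * user side is the destination, so the stores use strt, the
 * unprivileged store, while the kernel source is read with plain ldr.
 */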

ENTRY(copyout)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
	GET_CURPCB(r4)

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are fewer than 8 bytes, take the slow (byte-by-byte) path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * Align destination to word boundary.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lalend
	.word	.Lalend
	.word	.Lal3
	.word	.Lal2
	.word	.Lal1
.Lal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lalend:

	/*
	 * If fewer than 8 bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Lcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lcleanup8

	/*
	 * Align destination to cacheline boundary.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lcaligned
	.word	.Lcaligned
	.word	.Lcal28
	.word	.Lcal24
	.word	.Lcal20
	.word	.Lcal16
	.word	.Lcal12
	.word	.Lcal8
	.word	.Lcal4
.Lcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal8:	ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal4:	ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 * 	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x40
	bge	.Lcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x08
	blt	.Lprecleanup

.Lcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	cmp	r2, #8
	bge	.Lcleanup8

.Lprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lout

.Lcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lcend
	.word	.Lc4
	.word	.Lc1
	.word	.Lc2
	.word	.Lc3
.Lc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strbt	r7, [r1], #1
.Lcend:
	bne	.Lcleanup

.Lout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(copyout)

/*
 * r0 = kernel space source address
 * r1 = kernel space destination address
 * r2 = length
 *
 * Copies bytes from kernel space to kernel space, aborting on page fault
 *
 * Copy of copyout, but without the ldrt/strt instructions.
 */
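/*
 * Roughly the C-level contract (a sketch; see kcopy(9)):
 *
 *	int kcopy(const void *src, void *dst, size_t len);
 *
 * Both addresses are kernel addresses, so ordinary ldr/str are used;
 * the PCB_ONFAULT handler is still installed so that a fault on either
 * address returns an error instead of panicking.
 */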

ENTRY(kcopy)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
	GET_CURPCB(r4)

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are fewer than 8 bytes, take the slow (byte-by-byte) path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * Align destination to word boundary.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkalend
	.word	.Lkalend
	.word	.Lkal3
	.word	.Lkal2
	.word	.Lkal1
.Lkal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkalend:

	/*
	 * If fewer than 8 bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Lkcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lkcleanup8

	/*
	 * Align destination to cacheline boundary.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lkcaligned
	.word	.Lkcaligned
	.word	.Lkcal28
	.word	.Lkcal24
	.word	.Lkcal20
	.word	.Lkcal16
	.word	.Lkcal12
	.word	.Lkcal8
	.word	.Lkcal4
.Lkcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal8:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal4:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 * 	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lkcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6, r7}
	ldmia	r0!, {r6, r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6, r7}

	cmp	r2, #0x40
	bge	.Lkcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6-r7}
	ldmia	r0!, {r6-r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6-r7}

	cmp	r2, #0x08
	blt	.Lkprecleanup

.Lkcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	stmia	r1!, {r8-r9}
	cmp	r2, #8
	bge	.Lkcleanup8

.Lkprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lkout

.Lkcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkcend
	.word	.Lkc4
	.word	.Lkc1
	.word	.Lkc2
	.word	.Lkc3
.Lkc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Lkcend:
	bne	.Lkcleanup

.Lkout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(kcopy)
#endif	/* !__XSCALE__ && !_ARM_ARCH_6 */

/*
 * int badaddr_read_1(const uint8_t *src, uint8_t *dest)
 *
 * Copies a single 8-bit value from src to dest, returning 0 on success,
 * else EFAULT if a page fault occurred.
 */
ENTRY(badaddr_read_1)
	GET_CURPCB(r2)
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldrb	r3, [r0]
	nop
	nop
	nop
	strb	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_1)

/*
 * int badaddr_read_2(const uint16_t *src, uint16_t *dest)
 *
 * Copies a single 16-bit value from src to dest, returning 0 on success,
 * else EFAULT if a page fault occurred.
 */
ENTRY(badaddr_read_2)
	GET_CURPCB(r2)
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldrh	r3, [r0]
	nop
	nop
	nop
	strh	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_2)

/*
 * int badaddr_read_4(const uint32_t *src, uint32_t *dest)
 *
 * Copies a single 32-bit value from src to dest, returning 0 on success,
 * else EFAULT if a page fault occurred.
 */
ENTRY(badaddr_read_4)
	GET_CURPCB(r2)
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldr	r3, [r0]
	nop
	nop
	nop
	str	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_4)