xref: /netbsd-src/sys/arch/arm/arm/blockio.S (revision beb9bdb00e5421761976d5c277c0da84fd703f9b)
1/*	$NetBSD: blockio.S,v 1.9 2022/10/20 06:58:38 skrll Exp $	*/
2
3/*
4 * Copyright (c) 2001 Ben Harris.
5 * Copyright (c) 1994 Mark Brinicombe.
6 * Copyright (c) 1994 Brini.
7 * All rights reserved.
8 *
9 * This code is derived from software written for Brini by Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by Brini.
22 * 4. The name of the company nor the name of the author may be used to
23 *    endorse or promote products derived from this software without specific
24 *    prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
27 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
28 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
29 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
30 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * RiscBSD kernel project
39 *
40 * blockio.S
41 *
42 * optimised block read/write from/to IO routines.
43 *
44 * Created      : 08/10/94
45 * Modified	: 22/01/99  -- R.Earnshaw
46 *			       Faster, and small tweaks for StrongARM
47 */
48
49#include <machine/asm.h>
50
51RCSID("$NetBSD: blockio.S,v 1.9 2022/10/20 06:58:38 skrll Exp $")
52
53/*
54 * Read bytes from an I/O address into a block of memory
55 *
56 * r0 = address to read from (IO)
57 * r1 = address to write to (memory)
58 * r2 = length
59 */
60
61/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	/*
	 * In:      r0 = I/O address (never advanced; every byte is read
	 *               from this same location)
	 *          r1 = destination in memory (post-incremented)
	 *          r2 = number of bytes to transfer
	 * Scratch: r3, ip
	 *
	 * A frame {fp, ip(=entry sp), lr, pc} is pushed on entry; each exit
	 * is an "ldmdb fp, {fp, sp, pc}" which reloads fp and sp from that
	 * frame and returns by loading the saved lr into pc.
	 */
	mov	ip, sp
	push	{fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	ip, r1, #3
	beq	.Lrm1_main		/* aligned destination */

	/* Store 1..3 single bytes so that r1 becomes word aligned. */
	rsb	ip, ip, #4		/* ip = bytes up to the next word boundary */
	cmp	ip, #2
	ldrb	r3, [r0]		/* first byte: always */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte: only if ip >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte: only if ip == 3 */
	strbgt	r3, [r1], #1
	subs	r2, r2, ip		/* account for the bytes just moved */
	blt	.Lrm1_l4

	/*
	 * Main loop: four byte reads from the port are packed into one
	 * word (first byte read goes into bits 0-7) and stored with a
	 * single word store.
	 */
.Lrm1_main:
.Lrm1loop:
	ldrb	r3, [r0]
	ldrb	ip, [r0]
	orr	r3, r3, ip, lsl #8
	ldrb	ip, [r0]
	orr	r3, r3, ip, lsl #16
	ldrb	ip, [r0]
	orr	r3, r3, ip, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop

	/* Tail: between 0 and 3 bytes are left. */
.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: return */
	cmp	r2, #2
	ldrb	r3, [r0]		/* first byte: always */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte: only if r2 >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte: only if r2 == 3 */
	strbgt	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}
END(read_multi_1)
105
106/*
107 * Write bytes to an I/O address from a block of memory
108 *
109 * r0 = address to write to (IO)
110 * r1 = address to read from (memory)
111 * r2 = length
112 */
113
114/* This code will look very familiar if you've read _memcpy(). */
ENTRY(write_multi_1)
	/*
	 * r0 = I/O address (fixed), r1 = source in memory (advanced),
	 * r2 = byte count.  r3 and ip are used as scratch.
	 */

	/* Push a {fp, ip, lr, pc} frame; the exits below unwind through fp. */
	mov	ip, sp
	push	{fp, ip, lr, pc}
	sub	fp, ip, #4

	subs	r2, r2, #4		/* bias the count: r2 = length - 4 */
	blt	.Lwm1_tail		/* fewer than 4 bytes in total */

	ands	ip, r1, #3		/* ip = misalignment of the source */
	beq	.Lwm1_words		/* source is already word aligned */

	/* Send 1..3 leading bytes so that r1 reaches a word boundary. */
	rsb	ip, ip, #4		/* ip = bytes up to that boundary */
	cmp	ip, #2
	ldrb	r3, [r1], #1		/* byte 1: unconditional */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* byte 2: when ip >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* byte 3: when ip == 3 */
	strbgt	r3, [r0]
	subs	r2, r2, ip		/* deduct what was just sent */
	blt	.Lwm1_tail

	/*
	 * Word loop: fetch one word from memory and push it out to the
	 * port as four byte stores, least significant byte first.
	 */
.Lwm1_words:
	ldr	r3, [r1], #4
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1_words

	/* Tail: undo the bias and send the remaining 0..3 bytes. */
.Lwm1_tail:
	adds	r2, r2, #4		/* r2 = bytes still to send */
	ldmdbeq	fp, {fp, sp, pc}	/* none: unwind and return */
	cmp	r2, #2
	ldrb	r3, [r1], #1		/* byte 1: unconditional */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* byte 2: when r2 >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* byte 3: when r2 == 3 */
	strbgt	r3, [r0]
	ldmdb	fp, {fp, sp, pc}	/* unwind and return */
END(write_multi_1)
157
158/*
159 * Reads short ints (16 bits) from an I/O address into a block of memory
160 *
161 * r0 = address to read from (IO)
162 * r1 = address to write to (memory)
163 * r2 = length
164 */
165
ENTRY(insw)
	/*
	 * r0 = I/O address (fixed), r1 = destination in memory (advanced),
	 * r2 = count; it drops by one for every 16-bit item stored.
	 * r3 and ip are used as scratch.
	 */

	/* A zero or negative count means there is nothing to do. */
	cmp	r2, #0
	RETc(le)

	/*
	 * The word-store path is taken only when the count is even and the
	 * destination is word aligned; otherwise use the byte-store loop.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Linsw_words

	/* General case: one port read, two byte stores per item. */
.Linsw_bytes:
	ldr	r3, [r0]
	subs	r2, r2, #1		/* count down between the load and its first use */
	strb	r3, [r1], #1		/* bits 0-7 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #1		/* bits 8-15 */
	bgt	.Linsw_bytes		/* flags still come from the subs above */

	RET

	/* Aligned case: two port reads are merged into one word store. */
.Linsw_words:
	/*
	 * The first load goes through offset +2; per the original author's
	 * note this takes advantage of nonaligned word accesses, and the
	 * shift below then leaves that item in bits 0-15.
	 * NOTE(review): depends on the CPU's unaligned-load behaviour --
	 * confirm for the target core.
	 */
	ldr	r3, [r0, #2]
	ldr	ip, [r0]
	mov	r3, r3, lsr #16		/* first item -> low half */
	orr	r3, r3, ip, lsl #16	/* second item -> high half */
	str	r3, [r1], #4
	subs	r2, r2, #2		/* two items consumed */
	bgt	.Linsw_words

	RET
END(insw)
205
206
207/*
208 * Writes short ints (16 bits) from a block of memory to an I/O address
209 *
210 * r0 = address to write to (IO)
211 * r1 = address to read from (memory)
212 * r2 = length
213 */
214
ENTRY(outsw)
	/*
	 * r0 = I/O address (fixed), r1 = source in memory (advanced),
	 * r2 = count; it drops by one for every 16-bit item written.
	 * r3 and ip are used as scratch.
	 */

	/* A zero or negative count means there is nothing to do. */
	cmp	r2, #0
	RETc(le)

	/*
	 * The word-load path is taken only when the count is even and the
	 * source is word aligned; otherwise use the byte-load loop.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Loutsw_words

	/* General case: assemble each item from two byte loads. */
.Loutsw_bytes:
	ldrb	r3, [r1], #1		/* bits 0-7 of the item */
	ldrb	ip, [r1], #1		/* bits 8-15 of the item */
	subs	r2, r2, #1		/* count down between the load and its first use */
	orr	r3, r3, ip, lsl #8	/* r3 = 16-bit item */
	orr	r3, r3, r3, lsl #16	/* same item copied into both halves */
	str	r3, [r0]
	bgt	.Loutsw_bytes		/* flags still come from the subs above */

	RET

	/*
	 * Aligned case: one word load supplies two items.  With the word
	 * written as (H)(L), three exclusive-ors turn it into (L)(L) in r3
	 * and (H)(H) in ip; the low item is written to the port first.
	 *
	 * The original author kept a shift/orr alternative for reference:
	 *	mov ip, r3, lsl #16 ; orr ip, ip, ip, lsr #16 ; str ip, [r0]
	 *	mov ip, r3, lsr #16 ; orr ip, ip, ip, lsl #16 ; str ip, [r0]
	 */
.Loutsw_words:
	ldr	r3, [r1], #4		/* r3 = (H)(L) */
	subs	r2, r2, #2		/* two items consumed; tested by the bgt below */

	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]		/* low item */
	str	ip, [r0]		/* high item */

	bgt	.Loutsw_words

	RET
END(outsw)
267
/*
 * Reads short ints (16 bits) from an I/O address into a block of memory
 * with a length guaranteed to be a multiple of 16 bytes
 * with a word aligned destination address
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 */
277
ENTRY(insw16)
	/*
	 * r0 = I/O address (fixed), r1 = destination in memory (advanced),
	 * r2 = count; each pass of the loop below takes 8 off it and stores
	 * four words.  r3, ip and lr are scratch; r4 and r5 are saved and
	 * restored around the loop.
	 */

	/* A zero or negative count means there is nothing to do. */
	cmp	r2, #0
	RETc(le)

	/*
	 * The unrolled loop needs a count that is a multiple of 8 and a
	 * word aligned destination.  If either test fails, hand the whole
	 * request over to insw with r0-r2 untouched.
	 */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(insw)

	push	{r4,r5,lr}

	/*
	 * Each group below makes two port reads and merges them into one
	 * word.  The first load of each pair goes through offset +2; per the
	 * original author's note this takes advantage of nonaligned word
	 * accesses, and the following shift leaves that item in bits 0-15.
	 * NOTE(review): depends on the CPU's unaligned-load behaviour --
	 * confirm for the target core.
	 */
.Linsw16_next:
	ldr	r3, [r0, #2]
	ldr	lr, [r0]
	mov	r3, r3, lsr #16		/* word 0, low half */
	orr	r3, r3, lr, lsl #16	/* word 0, high half */

	ldr	r4, [r0, #2]
	ldr	lr, [r0]
	mov	r4, r4, lsr #16		/* word 1, low half */
	orr	r4, r4, lr, lsl #16	/* word 1, high half */

	ldr	r5, [r0, #2]
	ldr	lr, [r0]
	mov	r5, r5, lsr #16		/* word 2, low half */
	orr	r5, r5, lr, lsl #16	/* word 2, high half */

	ldr	ip, [r0, #2]
	ldr	lr, [r0]
	mov	ip, ip, lsr #16		/* word 3, low half */
	orr	ip, ip, lr, lsl #16	/* word 3, high half */

	stmia	r1!, {r3-r5,ip}		/* store all four words at once */
	subs	r2, r2, #8
	bgt	.Linsw16_next

	pop	{r4,r5,pc}		/* restore r4/r5 and return */
END(insw16)
326
327
328/*
329 * Writes short ints (16 bits) from a block of memory to an I/O address
330 *
331 * r0 = address to write to (IO)
332 * r1 = address to read from (memory)
333 * r2 = length
334 */
335
ENTRY(outsw16)
	/*
	 * r0 = I/O address (fixed), r1 = source in memory (advanced),
	 * r2 = count; each pass of the loop below takes 8 off it and loads
	 * four words.  r3, ip and lr are scratch; r4 and r5 are saved and
	 * restored around the loop.
	 */

	/* A zero or negative count means there is nothing to do. */
	cmp	r2, #0
	RETc(le)

	/*
	 * The unrolled loop needs a count that is a multiple of 8 and a
	 * word aligned source.  If either test fails, hand the whole
	 * request over to outsw with r0-r2 untouched.
	 */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(outsw)

	push	{r4,r5,lr}

	/*
	 * Four words are loaded per pass.  For each word (A)(B), three
	 * exclusive-ors produce (B)(B) in r3 and (A)(A) in the register the
	 * word was loaded into; (B)(B), the low item, is written first.
	 *
	 * The original author kept a shift/orr alternative for reference:
	 *	mov r3, r4, lsl #16 ; orr r3, r3, r3, lsr #16 ; str r3, [r0]
	 *	mov r3, r4, lsr #16 ; orr r3, r3, r3, lsl #16 ; str r3, [r0]
	 */
.Loutsw16_next:
	ldmia	r1!, {r4,r5,ip,lr}

	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #8
	bgt	.Loutsw16_next

	pop	{r4,r5,pc}		/* restore r4/r5 and return */
END(outsw16)
394
395/*
396 * reads short ints (16 bits) from an I/O address into a block of memory
397 * The I/O address is assumed to be mapped multiple times in a block of
398 * 8 words.
399 * The destination address should be word aligned.
400 *
401 * r0 = address to read from (IO)
402 * r1 = address to write to (memory)
403 * r2 = length
404 */
405
ENTRY(inswm8)
	/*
	 * In:      r0 = I/O address; up to eight consecutive words starting
	 *               here are read with a single ldmia
	 *          r1 = destination in memory (advanced)
	 *          r2 = count; it drops by one for every 16-bit item stored
	 * Scratch: r3, ip; r4-r9 are saved and restored; lr holds the
	 *          0xffff0000 mask while the transfer runs.
	 *
	 * Every word read from the port contributes only its low 16 bits;
	 * two such reads are combined into one word stored to memory.
	 */

/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/*
 * Only the destination alignment is checked here: a destination that
 * is not word aligned is handed to insw with r0-r2 untouched.
 */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw */

	push	{r4-r9,lr}

	mov	lr, #0xff000000
	orr	lr, lr, #0x00ff0000	/* lr = 0xffff0000: selects the high half */

	/* Eight items per pass while at least eight remain. */
.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8

	ldmia	r0, {r3-r9,ip}

	bic	r3, r3, lr		/* keep the low half of the 1st read */
	orr	r3, r3, r4, lsl #16	/* 2nd read becomes the high half */
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	b	.Linswm8_l1		/* count reached zero: done */

	/* Four items, if at least four remain. */
.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4

	ldmia	r0, {r3-r6}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3-r4}

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1

	/* Two items, if at least two remain. */
.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2

	ldmia	r0, {r3-r4}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #0x0004

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1

	/*
	 * One last item, stored as two bytes.  The count is not decremented
	 * here because nothing reads r2 or the flags before the return.
	 */
.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1

	ldr	r3, [r0]

	strb	r3, [r1], #0x0001
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001


.Linswm8_l1:
	pop	{r4-r9,pc}		/* And go home */
END(inswm8)
491
/*
 * write short ints (16 bits) to an I/O address from a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The source address should be word aligned.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 */
502
ENTRY(outswm8)
	/*
	 * In:      r0 = I/O address; up to eight consecutive words starting
	 *               here are written with a single stmia
	 *          r1 = source in memory (advanced)
	 *          r2 = count; it drops by one for every 16-bit item written
	 * Scratch: r3, ip, lr; r4-r8 are saved and restored.
	 *
	 * Every source word (A)(B) is expanded into two port words, (B)(B)
	 * then (A)(A), using three exclusive-ors per source word.
	 */

/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/*
 * Only the source alignment is checked here: a source that is not
 * word aligned is handed to outsw with r0-r2 untouched.
 */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw */

	push	{r4-r8,lr}

	/* Eight items (four source words) per pass while at least eight remain. */
.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8

	ldmia	r1!, {r3,r5,r7,ip}

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	b	.Loutswm8_l1		/* count reached zero: done */

	/* Four items (two source words), if at least four remain. */
.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4

	ldmia	r1!, {r3-r4}

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r5-r8}

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1

	/* Two items (one source word), if at least two remain. */
.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* flags are consumed by the beq below */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r4, r5}

	beq	.Loutswm8_l1

	/*
	 * One last item, built from two byte loads and copied into both
	 * halves of the word written.  The count is not decremented here
	 * because nothing reads r2 or the flags before the return.
	 */
.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1

	ldrb	r3, [r1], #0x0001
	ldrb	r4, [r1], #0x0001
	orr	r3, r3, r4, lsl #8
	orr	r3, r3, r3, lsl #16
	str	r3, [r0]

.Loutswm8_l1:
	pop	{r4-r8,pc}		/* And go home */
END(outswm8)
596