/*	$NetBSD: blockio.S,v 1.9 2022/10/20 06:58:38 skrll Exp $	*/

/*
 * Copyright (c) 2001 Ben Harris.
 * Copyright (c) 1994 Mark Brinicombe.
 * Copyright (c) 1994 Brini.
 * All rights reserved.
 *
 * This code is derived from software written for Brini by Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Brini.
 * 4. The name of the company nor the name of the author may be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * RiscBSD kernel project
 *
 * blockio.S
 *
 * optimised block read/write from/to IO routines.
 *
 * Created      : 08/10/94
 * Modified	: 22/01/99  -- R.Earnshaw
 *		  Faster, and small tweaks for StrongARM
 */

#include <machine/asm.h>

RCSID("$NetBSD: blockio.S,v 1.9 2022/10/20 06:58:38 skrll Exp $")

/*
 * Read bytes from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 *
 * The I/O address in r0 is never advanced; every byte is fetched from the
 * same location.  r1 and r2 are consumed, r3 and ip are used as scratch.
 *
 * Up to three single bytes are transferred first so that r1 becomes word
 * aligned, then four bytes at a time are packed into one word (first byte
 * read in bits 0-7, fourth in bits 24-31) and written with a single str,
 * and finally any 1-3 left-over bytes are transferred singly.
 * NOTE(review): packing the first byte into the low bits matches arrival
 * order in memory only for a little-endian data layout -- confirm for the
 * platforms that build this file.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	/* Build a stack frame: saved fp, caller's sp (via ip), lr, pc */
	mov	ip, sp
	push	{fp, ip, lr, pc}
	sub	fp, ip, #4		/* fp -> slot holding the saved pc */
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	ip, r1, #3
	beq	.Lrm1_main		/* aligned destination */
	rsb	ip, ip, #4		/* ip = bytes needed to align r1 (1..3) */
	cmp	ip, #2			/* flags select how many bytes to copy */
	ldrb	r3, [r0]		/* always at least one byte */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte if ip >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte if ip == 3 */
	strbgt	r3, [r1], #1
	subs	r2, r2, ip		/* account for the alignment bytes */
	blt	.Lrm1_l4		/* fewer than 4 bytes remain */
.Lrm1_main:
.Lrm1loop:
	/* Four byte reads from the port, assembled low byte first */
	ldrb	r3, [r0]
	ldrb	ip, [r0]
	orr	r3, r3, ip, lsl #8
	ldrb	ip, [r0]
	orr	r3, r3, ip, lsl #16
	ldrb	ip, [r0]
	orr	r3, r3, ip, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop
.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	/* Here r2 is 1, 2 or 3 (the r2 == 0 case returned just above) */
	cmp	r2, #2
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte if r2 >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte if r2 == 3 */
	strbgt	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}	/* restore fp and sp, return via saved lr */
END(read_multi_1)

/*
 * Write bytes to an I/O address from a block of memory
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 *
 * The I/O address in r0 is never advanced; every byte is stored to the
 * same location.  r1 and r2 are consumed, r3 and ip are used as scratch.
 *
 * Mirror image of read_multi_1: single bytes until the source pointer is
 * word aligned, then one word load per four byte stores (bits 0-7 are
 * written first), then a 1-3 byte tail.
 * NOTE(review): writing bits 0-7 first preserves memory order only for a
 * little-endian data layout -- confirm for the platforms that build this.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(write_multi_1)
	/* Build a stack frame: saved fp, caller's sp (via ip), lr, pc */
	mov	ip, sp
	push	{fp, ip, lr, pc}
	sub	fp, ip, #4		/* fp -> slot holding the saved pc */
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lwm1_l4		/* less than 4 bytes */
	ands	ip, r1, #3
	beq	.Lwm1_main		/* aligned source */
	rsb	ip, ip, #4		/* ip = bytes needed to align r1 (1..3) */
	cmp	ip, #2			/* flags select how many bytes to copy */
	ldrb	r3, [r1], #1		/* always at least one byte */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* second byte if ip >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* third byte if ip == 3 */
	strbgt	r3, [r0]
	subs	r2, r2, ip		/* account for the alignment bytes */
	blt	.Lwm1_l4		/* fewer than 4 bytes remain */
.Lwm1_main:
.Lwm1loop:
	/* One word from memory, emitted as four byte writes, low byte first */
	ldr	r3, [r1], #4
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1loop
.Lwm1_l4:
	adds	r2, r2, #4		/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	/* Here r2 is 1, 2 or 3 */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* second byte if r2 >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* third byte if r2 == 3 */
	strbgt	r3, [r0]
	ldmdb	fp, {fp, sp, pc}	/* restore fp and sp, return via saved lr */
END(write_multi_1)

/*
 * Reads short ints (16 bits) from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 *
 * The length is counted in 16 bit items: the slow loop below decrements
 * r2 by one for every two bytes it stores.  Returns immediately when the
 * length is zero or negative.  r3 and ip are used as scratch.
 */

ENTRY(insw)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/* If the destination address and the size is word aligned, do it fast */

	tst	r2, #0x00000001		/* even number of shorts ... */
	tsteq	r1, #0x00000003		/* ... and word aligned destination? */
	beq	.Lfastinsw

/* Non aligned insw */

.Linswloop:
	ldr	r3, [r0]		/* low 16 bits hold the data */
	subs	r2, r2, #0x00000001	/* Loop test in load delay slot */
	strb	r3, [r1], #0x0001	/* low byte first */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001	/* then the high byte */
	bgt	.Linswloop

	RET

/* Word aligned insw */

.Lfastinsw:

.Lfastinswloop:
	/*
	 * Two port reads per stored word.  The first short is taken from
	 * the top 16 bits of the r0+2 read and ends up in the low half of
	 * the stored word; the second comes from the low 16 bits of the
	 * plain read and ends up in the high half.
	 * NOTE(review): this relies on how the CPU/bus returns data for the
	 * unaligned word load -- confirm for the target platform.
	 */
	ldr	r3, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	ip, [r0]
	mov	r3, r3, lsr #16		/* Put the two shorts together */
	orr	r3, r3, ip, lsl #16
	str	r3, [r1], #0x0004	/* Store */
	subs	r2, r2, #0x00000002	/* Next */
	bgt	.Lfastinswloop

	RET
END(insw)


/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 *
 * The length is counted in 16 bit items.  Returns immediately when the
 * length is zero or negative.  r3 and ip are used as scratch.
 * Each short is written to the port as a full word with the 16 bit value
 * duplicated in both halves.
 */

ENTRY(outsw)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/* If the source address and the size is word aligned, do it fast */

	tst	r2, #0x00000001		/* even number of shorts ... */
	tsteq	r1, #0x00000003		/* ... and word aligned source? */
	beq	.Lfastoutsw

/* Non aligned outsw */

.Loutswloop:
	ldrb	r3, [r1], #0x0001	/* low byte */
	ldrb	ip, [r1], #0x0001	/* high byte */
	subs	r2, r2, #0x00000001	/* Loop test in load delay slot */
	orr	r3, r3, ip, lsl #8	/* r3 = 16 bit value */
	orr	r3, r3, r3, lsl #16	/* copy it into the top half too */
	str	r3, [r0]
	bgt	.Loutswloop

	RET

/* Word aligned outsw */

.Lfastoutsw:

.Lfastoutswloop:
	ldr	r3, [r1], #0x0004	/* r3 = (H)(L) */
	subs	r2, r2, #0x00000002	/* Loop test in load delay slot */

	/* The eor/str instructions below leave the flags from subs intact */
	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]		/* low short first */
	str	ip, [r0]		/* then the high short */

/*	mov	ip, r3, lsl #16
 *	orr	ip, ip, ip, lsr #16
 *	str	ip, [r0]
 *
 *	mov	ip, r3, lsr #16
 *	orr	ip, ip, ip, lsl #16
 *	str	ip, [r0]
 */

	bgt	.Lfastoutswloop

	RET
END(outsw)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * with a length guaranteed to be a multiple of 16 bytes
 * with a word aligned destination address
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 *
 * The length is counted in 16 bit items; each loop pass handles eight of
 * them (16 bytes).  If the count is not a multiple of eight or r1 is not
 * word aligned, control is passed to insw with the arguments untouched.
 * r3 and ip are scratch; r4, r5 and lr are saved and restored.
 */

ENTRY(insw16)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/* If the destination address is word aligned and the size suitably
   aligned, do it fast */

	tst	r2, #0x00000007
	tsteq	r1, #0x00000003

	bne	_C_LABEL(insw)		/* plain branch: insw returns to our caller */

/* Word aligned insw */

	push	{r4,r5,lr}

.Linsw16loop:
	/* Four output words are built in r3, r4, r5 and ip; lr is scratch */
	ldr	r3, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r3, r3, lsr #16		/* Put the two shorts together */
	orr	r3, r3, lr, lsl #16

	ldr	r4, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r4, r4, lsr #16		/* Put the two shorts together */
	orr	r4, r4, lr, lsl #16

	ldr	r5, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r5, r5, lsr #16		/* Put the two shorts together */
	orr	r5, r5, lr, lsl #16

	ldr	ip, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	ip, ip, lsr #16		/* Put the two shorts together */
	orr	ip, ip, lr, lsl #16

	stmia	r1!, {r3-r5,ip}		/* store all 16 bytes at once */
	subs	r2, r2, #0x00000008	/* Next */
	bgt	.Linsw16loop

	pop	{r4,r5,pc}		/* Restore regs and go home */
END(insw16)


/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 *
 * The length is counted in 16 bit items; each loop pass handles eight of
 * them (16 bytes).  If the count is not a multiple of eight or r1 is not
 * word aligned, control is passed to outsw with the arguments untouched.
 * r3 and ip are scratch; r4, r5 and lr are saved and restored.
 */

ENTRY(outsw16)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/* If the source address is word aligned and the size suitably
   aligned, do it fast */

	tst	r2, #0x00000007
	tsteq	r1, #0x00000003

	bne	_C_LABEL(outsw)		/* plain branch: outsw returns to our caller */

/* Word aligned outsw */

	push	{r4,r5,lr}

.Loutsw16loop:
	ldmia	r1!, {r4,r5,ip,lr}	/* four words = eight shorts */

	/* Each word (A)(B) is written as (B)(B) followed by (A)(A) */
	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

/*	mov	r3, r4, lsl #16
 *	orr	r3, r3, r3, lsr #16
 *	str	r3, [r0]
 *
 *	mov	r3, r4, lsr #16
 *	orr	r3, r3, r3, lsl #16
 *	str	r3, [r0]
 */

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #0x00000008
	bgt	.Loutsw16loop

	pop	{r4,r5,pc}		/* and go home */
END(outsw16)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The destination address should be word aligned.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length
 *
 * The length is counted in 16 bit items.  Data is pulled from the port
 * with load-multiple instructions covering up to eight consecutive words
 * starting at r0 (r0 itself is not written back); the low 16 bits of each
 * word read are one data item.  Chunks of 8, 4, 2 and finally 1 item are
 * handled in turn.  If r1 is not word aligned, control is passed to insw.
 * r3 and ip are scratch; r4-r9 and lr are saved and restored.
 */

ENTRY(inswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/* If the destination address is not word aligned, let insw handle it */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)		/* plain branch: insw returns to our caller */

/* Word aligned insw */

	push	{r4-r9,lr}

	mov	lr, #0xff000000
	orr	lr, lr, #0x00ff0000	/* lr = 0xffff0000: top halfword mask */

.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8		/* fewer than 8 shorts left */

	ldmia	r0, {r3-r9,ip}		/* eight reads, one short in each */

	/* Pair them up: even read in the low half, odd read in the high */
	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	beq	.Linswm8_l1

.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4		/* fewer than 4 shorts left */

	ldmia	r0, {r3-r6}		/* four reads */

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3-r4}

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1

.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2		/* fewer than 2 shorts left */

	ldmia	r0, {r3-r4}		/* two reads */

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #0x0004

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1

.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1		/* nothing left */

	ldr	r3, [r0]
	subs	r2, r2, #0x00000001	/* Test in load delay slot */
					/* XXX, why don't we use result? */
					/* (r2 and the flags are not read
					 * again before the return below) */

	strb	r3, [r1], #0x0001	/* last short, stored as two bytes */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001


.Linswm8_l1:
	pop	{r4-r9,pc}		/* And go home */
END(inswm8)

/*
 * write short ints (16 bits) to an I/O address from a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The source address should be word aligned.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length
 *
 * The length is counted in 16 bit items.  Every item is expanded to a
 * full word with the 16 bit value duplicated in both halves, and the
 * words are pushed to the port with store-multiple instructions covering
 * up to eight consecutive words starting at r0 (r0 itself is not written
 * back).  Chunks of 8, 4, 2 and finally 1 item are handled in turn.  If r1
 * is not word aligned, control is passed to outsw.
 * r3 and ip are scratch; r4-r8 and lr are saved and restored.
 */

ENTRY(outswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	RETc(le)

/* If the source address is not word aligned, let outsw handle it */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)		/* plain branch: outsw returns to our caller */

/* Word aligned outsw */

	push	{r4-r8,lr}

.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8		/* fewer than 8 shorts left */

	/*
	 * Load into every other register so that each source word (A)(B)
	 * can be split into a (B)(B)/(A)(A) pair in adjacent registers,
	 * which the stmia below then emits in ascending register order.
	 */
	ldmia	r1!, {r3,r5,r7,ip}

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}	/* eight writes */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	beq	.Loutswm8_l1

.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4		/* fewer than 4 shorts left */

	ldmia	r1!, {r3-r4}		/* two words = four shorts */

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r5-r8}		/* four writes */

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1

.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2		/* fewer than 2 shorts left */

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* Done test in Load delay slot */

	/* eor and stmia leave the flags from subs intact for the beq */
	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r4, r5}		/* two writes */

	beq	.Loutswm8_l1

.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1		/* nothing left */

	ldrb	r3, [r1], #0x0001	/* low byte */
	ldrb	r4, [r1], #0x0001	/* high byte */
	subs	r2, r2, #0x00000001	/* Done test in load delay slot */
					/* XXX This test isn't used? */
					/* (r2 and the flags are not read
					 * again before the return below) */
	orr	r3, r3, r4, lsl #8	/* r3 = 16 bit value */
	orr	r3, r3, r3, lsl #16	/* copy it into the top half too */
	str	r3, [r0]

.Loutswm8_l1:
	pop	{r4-r8,pc}		/* And go home */
END(outswm8)