/*	$NetBSD: bcopyinout.S,v 1.23 2022/10/20 06:58:38 skrll Exp $	*/

/*
 * Copyright (c) 2002 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Allen Briggs for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project by
 *	Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_multiprocessor.h"
#include "opt_cpuoptions.h"

#include "assym.h"

#include <machine/asm.h>

#include <arm/locore.h>

#if defined(__XSCALE__) || defined(_ARM_ARCH_6)
/*
 * armv6 and v7 have pld and strd so they can use the xscale
 * bcopyinout as well.
 */
#include "bcopyinout_xscale.S"
#else

RCSID("$NetBSD: bcopyinout.S,v 1.23 2022/10/20 06:58:38 skrll Exp $")

	.text
	.align	0

#define SAVE_REGS	stmfd	sp!, {r4-r11}
#define RESTORE_REGS	ldmfd	sp!, {r4-r11}

#if defined(__XSCALE__) || defined(_ARM_ARCH_6)
/* HELLOCPP is a cpp trick to get a literal '#' into the expansion */
#define HELLOCPP #
#define PREFETCH(rx,o)	pld	[ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif

/*
 * r0 = user space address
 * r1 = kernel space address
 * r2 = length
 *
 * Copies bytes from user space to kernel space.
 *
 * We save/restore r4-r11, which we use as scratch registers here
 * (they are callee-saved in the ABI).
 */
ENTRY(copyin)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are only a few bytes, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * Align destination to word boundary.
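	 *
	 * A note on the dispatch below (this pattern recurs throughout the
	 * file): "ldr pc, [pc, r6, lsl #2]" is a PC-relative jump table.
	 * In ARM state the PC reads as the address of the current
	 * instruction + 8, which is exactly where the first .word entry
	 * sits, so the load fetches table entry r6 and jumps to it; the
	 * intervening "b .Lialend" is never executed and merely pads the
	 * slot between the load and the table.  An index of 1 means the
	 * destination is 1 byte past a word boundary, so 3 bytes must be
	 * copied to align it, hence entry 1 is .Lial3, and so on.  The
	 * later cacheline-alignment tables omit the "lsl #2" because
	 * their index (dest & 0x1f, with dest already word aligned) is
	 * already a byte offset that is a multiple of 4.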
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lialend
	.word	.Lialend
	.word	.Lial3
	.word	.Lial2
	.word	.Lial1
.Lial3:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lial2:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lial1:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lialend:

	/*
	 * If only a few bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Licleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Licleanup8

	/*
	 * Align destination to cacheline boundary.
	 * If source and destination are nicely aligned, this can be a big
	 * win.  If not, it's still cheaper to copy in groups of 32 even if
	 * we don't get the nice cacheline alignment.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Licaligned
	.word	.Licaligned
	.word	.Lical28
	.word	.Lical24
	.word	.Lical20
	.word	.Lical16
	.word	.Lical12
	.word	.Lical8
	.word	.Lical4
.Lical28:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical24:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical20:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical16:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical12:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical8:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical4:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Licaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x40
	bge	.Licaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x08
	blt	.Liprecleanup

.Licleanup8:
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	sub	r2, r2, #8
	stmia	r1!, {r8, r9}
	cmp	r2, #8
	bge	.Licleanup8

.Liprecleanup:
	/*
	 * If we're done, bail.
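	 *
	 * The byte cleanup below dispatches on (len & 3) into an unrolled
	 * run of single-byte copies and then loops; in effect:
	 *
	 *	while (len > 0)
	 *		copy ((len & 3) ? (len & 3) : 4) bytes;
	 *
	 * so a remainder of 0 (len is 4 here, since 0 < len < 8) enters
	 * at .Lic4 and falls through all four copies, while remainders
	 * 1-3 enter partway through the run.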
	 */
	cmp	r2, #0
	beq	.Liout

.Licleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Licend
	.word	.Lic4
	.word	.Lic1
	.word	.Lic2
	.word	.Lic3
.Lic4:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic3:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lic2:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic1:	ldrbt	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Licend:
	bne	.Licleanup

.Liout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET

.Lcopyfault:
	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(copyin)

/*
 * r0 = kernel space address
 * r1 = user space address
 * r2 = length
 *
 * Copies bytes from kernel space to user space.
 *
 * We save/restore r4-r11, which we use as scratch registers here
 * (they are callee-saved in the ABI).
 */

ENTRY(copyout)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are only a few bytes, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * Align destination to word boundary.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lalend
	.word	.Lalend
	.word	.Lal3
	.word	.Lal2
	.word	.Lal1
.Lal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lalend:

	/*
	 * If only a few bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Lcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lcleanup8

	/*
	 * Align destination to cacheline boundary.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lcaligned
	.word	.Lcaligned
	.word	.Lcal28
	.word	.Lcal24
	.word	.Lcal20
	.word	.Lcal16
	.word	.Lcal12
	.word	.Lcal8
	.word	.Lcal4
.Lcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal8:	ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal4:	ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
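	 *
	 * Unlike copyin, which batches its kernel-side stores with stmia,
	 * copyout batches its kernel-side loads with ldmia and issues the
	 * user-side stores one word at a time with strt: user accesses
	 * must be checked against user permissions, and there is no
	 * multi-word equivalent of the user-mode ldrt/strt.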
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x40
	bge	.Lcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x08
	blt	.Lprecleanup

.Lcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	cmp	r2, #8
	bge	.Lcleanup8

.Lprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lout

.Lcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lcend
	.word	.Lc4
	.word	.Lc1
	.word	.Lc2
	.word	.Lc3
.Lc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strbt	r7, [r1], #1
.Lcend:
	bne	.Lcleanup

.Lout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(copyout)

/*
 * r0 = kernel space source address
 * r1 = kernel space destination address
 * r2 = length
 *
 * Copies bytes from kernel space to kernel space, aborting on page fault.
 *
 * A copy of copyout, but using plain ldr/str in place of the user-mode
 * ldrt/strt instructions.
 */

ENTRY(kcopy)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are only a few bytes, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * Align destination to word boundary.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkalend
	.word	.Lkalend
	.word	.Lkal3
	.word	.Lkal2
	.word	.Lkal1
.Lkal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkalend:

	/*
	 * If only a few bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Lkcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lkcleanup8

	/*
	 * Align destination to cacheline boundary.
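	 *
	 * Since kcopy never touches user space it needs no ldrt/strt, so
	 * the cacheline loop below can move whole cachelines with paired
	 * ldmia/stmia instructions.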
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lkcaligned
	.word	.Lkcaligned
	.word	.Lkcal28
	.word	.Lkcal24
	.word	.Lkcal20
	.word	.Lkcal16
	.word	.Lkcal12
	.word	.Lkcal8
	.word	.Lkcal4
.Lkcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal8:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal4:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lkcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6, r7}
	ldmia	r0!, {r6, r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6, r7}

	cmp	r2, #0x40
	bge	.Lkcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6-r7}
	ldmia	r0!, {r6-r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6-r7}

	cmp	r2, #0x08
	blt	.Lkprecleanup

.Lkcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	stmia	r1!, {r8-r9}
	cmp	r2, #8
	bge	.Lkcleanup8

.Lkprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lkout

.Lkcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkcend
	.word	.Lkc4
	.word	.Lkc1
	.word	.Lkc2
	.word	.Lkc3
.Lkc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Lkcend:
	bne	.Lkcleanup

.Lkout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(kcopy)
#endif	/* !__XSCALE__ && !_ARM_ARCH_6 */

/*
 * int badaddr_read_1(const uint8_t *src, uint8_t *dest)
 *
 * Copies a single 8-bit value from src to dest, returning 0 on success,
 * else EFAULT if a page fault occurred.
 */
ENTRY(badaddr_read_1)
	GET_CURPCB(r2)
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldrb	r3, [r0]
	nop
	nop
	nop
	strb	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_1)

/*
 * int badaddr_read_2(const uint16_t *src, uint16_t *dest)
 *
 * Copies a single 16-bit value from src to dest, returning 0 on success,
 * else EFAULT if a page fault occurred.
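 *
 * The recovery scheme here (and in the copy routines above) relies on
 * the data abort handler: if the access faults while pcb_onfault is
 * set, the handler stashes EFAULT in r0 and resumes execution at the
 * pcb_onfault address, which restores the saved pcb_onfault value and
 * returns.  On the success path r0 has already been cleared before the
 * label is reached.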
 */
ENTRY(badaddr_read_2)
	GET_CURPCB(r2)
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldrh	r3, [r0]
	nop
	nop
	nop
	strh	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_2)

/*
 * int badaddr_read_4(const uint32_t *src, uint32_t *dest)
 *
 * Copies a single 32-bit value from src to dest, returning 0 on success,
 * else EFAULT if a page fault occurred.
 */
ENTRY(badaddr_read_4)
	GET_CURPCB(r2)
	ldr	ip, [r2, #PCB_ONFAULT]
	adr	r3, 1f
	str	r3, [r2, #PCB_ONFAULT]
	nop
	nop
	nop
	ldr	r3, [r0]
	nop
	nop
	nop
	str	r3, [r1]
	mov	r0, #0		/* No fault */
1:	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_4)
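
/*
 * Illustrative use of the badaddr_read_*() helpers from C (hypothetical
 * caller; these are typically used when probing whether a bus address
 * responds):
 *
 *	uint32_t val;
 *
 *	if (badaddr_read_4(addr, &val) == 0)
 *		... the address responded and val holds the data read ...
 */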