/*	$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $	*/

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#undef NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT	x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT	x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32

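/*
 * Illustrative C sketch (not assembled): roughly the flow that the entry
 * point below implements.  The helper structure here is hypothetical and
 * only stands in for the forward/backward paths; the real code additionally
 * special-cases small lengths (< SMALLSIZE) and pre-aligns DST before the
 * ldp/stp loops.  Argument order follows memcpy/memmove; bcopy takes
 * (src, dst, len) instead, and only memcpy (NO_OVERLAP) may ignore overlap.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *
 *	copy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		uint8_t *d = dst;
 *		const uint8_t *s = src;
 *
 *		if (len == 0 || d == s)
 *			return dst;
 *		if (s < d && d < s + len) {
 *			// dst overlaps the tail of src: copy backward
 *			while (len-- > 0)
 *				d[len] = s[len];
 *		} else {
 *			// no harmful overlap: copy forward
 *			for (size_t i = 0; i < len; i++)
 *				d[i] = s[i];
 *		}
 *		return dst;
 *	}
 */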
	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST is not aligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/* *--(char *)dst = *--(char *)src */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/*  (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/*  (data0>>src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1=data0>>src_dst_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/*  (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/*  (data0<<src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1=data0>>dst_src_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32; */
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16; */
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8; */
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#endif /* BYTE_ORDER */


backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */

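/*
 * The shifting-copy loops above and below (backward_shifting_copy_loop,
 * shifting_copy_loop) handle the case where src and dst have different
 * alignment within an 8-byte word: only aligned 8-byte loads and stores are
 * issued, and each destination word is assembled from two adjacent source
 * words with a pair of shifts.  A simplified little-endian C model of the
 * forward direction (illustrative only, not assembled; it ignores the
 * head/tail handling, assumes len is a multiple of 8, assumes
 * src_dst_alignbit > 0, and takes src8 as the aligned word holding the
 * first source byte):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void
 *	shifting_copy_le(uint64_t *dst8, const uint64_t *src8, size_t len,
 *	    unsigned src_dst_alignbit)	// ((src & 7) - (dst & 7)) * 8, > 0
 *	{
 *		unsigned dst_src_alignbit = 64 - src_dst_alignbit;
 *		uint64_t data0 = *src8++;
 *
 *		for (size_t i = 0; i < len / 8; i++) {
 *			uint64_t data1 = *src8++;
 *			// low bytes come from data0, high bytes from data1
 *			dst8[i] = (data0 >> src_dst_alignbit) |
 *			    (data1 << dst_src_alignbit);
 *			data0 = data1;
 *		}
 *	}
 *
 * For example, with src % 8 == 3 and dst % 8 == 1, src_dst_alignbit is 16,
 * so each aligned destination word is (data0 >> 16) | (data1 << 48).
 */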

	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/* *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST is not aligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)
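
/*
 * Usage sketch (illustrative only, not assembled): the functions built from
 * this source differ in argument order and in whether overlapping regions
 * are supported.  memcpy (NO_OVERLAP) may copy forward regardless of
 * overlap; memmove and bcopy choose the copy direction so that overlapping
 * buffers are handled correctly.
 *
 *	#include <assert.h>
 *	#include <string.h>
 *	#include <strings.h>
 *
 *	int
 *	main(void)
 *	{
 *		char buf[8] = "abcdefg";
 *
 *		// overlapping shift by one byte: requires memmove/bcopy
 *		memmove(buf + 1, buf, 6);
 *		assert(strcmp(buf, "aabcdef") == 0);
 *
 *		bcopy(buf + 1, buf, 7);		// note: bcopy(src, dst, len)
 *		assert(strcmp(buf, "abcdef") == 0);
 *
 *		return 0;
 *	}
 */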