/* $NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu <ryo@nerv.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.1 2018/02/04 21:52:16 skrll Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION	memcpy
#define NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION	memmove
#undef NO_OVERLAP
#define SRC0		x1
#define DST0		x0
#define LEN		x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION	bcopy
#define NO_OVERLAP
#define SRC0		x0
#define DST0		x1
#define LEN		x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X		x3
#define TMP_Xw		w3
#define TMP_D		x4
#define TMP_S		x5
#define DST		x6
#define SRC		x7
#define DATA0		x8
#define DATA0w		w8
#define DATA1		x9
#define DATA1w		w9
#define DATA2		x10
#define SRC_ALIGNBIT	x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT	x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN	16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE	32

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
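
/*
 * The small-copy paths below (and their forward twins later in this file)
 * peel the remaining length off bit by bit instead of looping.  Roughly
 * equivalent C, for reference only (the real code keeps everything in
 * registers and pre-decrements both pointers):
 *
 *	if (len & 4)
 *		*(uint32_t *)(dst -= 4) = *(uint32_t *)(src -= 4);
 *	if (len & 2)
 *		*(uint16_t *)(dst -= 2) = *(uint16_t *)(src -= 2);
 *	if (len & 1)
 *		*(uint8_t *)(dst -= 1) = *(uint8_t *)(src -= 1);
 */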
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_backward:
	/* DST may be unaligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

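/*
 * Bulk backward copy.  Full 1024-byte chunks run the unrolled ldp/stp
 * block below; the 16-byte-multiple remainder is handled by computing a
 * branch target inside that same block, so exactly len/16 ldp/stp pairs
 * execute (each pair is 8 bytes of code, hence the "len/2" offset).
 * The intent, in illustrative C only (the helper names here are
 * descriptive, not real routines):
 *
 *	while (len >= 1024) { copy_16bytes_x64_backward(); len -= 1024; }
 *	for (n = len >> 4; n > 0; n--) copy_16bytes_backward();
 *	len &= 15;	// leftovers fall through to the bit tests below
 */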
	cmp	LEN, #1024
	bhs	backward_copy1k
backward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)		/* len &= 1023; len &= ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - len/2) */
	br	TMP_X
backward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	backward_copy1k
	cmp	LEN, #16
	bhs	backward_less1k

	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-10 bytes */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*2) */
	br	TMP_X
	.rept	10
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	.endr
8:
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN		/* tmp_s = src - len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!		/* *--(char *)dst = *--(char *)src; */
	cmp	TMP_S, SRC0			/* } while (tmp_s < src); */
	blo	1b
	ret

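/*
 * strict_backward handles copies whose source and destination sit at
 * different offsets within an 8-byte word, without issuing unaligned
 * accesses: both pointers are rounded down to 8-byte boundaries, whole
 * words are loaded, and every output word is stitched together from two
 * adjacent input words.  In illustrative C (little endian; "cur" and
 * "next" are descriptive names, not registers):
 *
 *	out = (next << k) | (cur >> (64 - k));	// k = misalignment * 8
 *
 * The ragged head and tail are stored piecewise so that no access falls
 * outside the caller's buffers.
 */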
strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/* data1=data0>>src_dst_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	str	DATA1w, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32		/* data1 >>= 32; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	strh	DATA1w, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16		/* data1 >>= 16; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	strb	DATA1w, [TMP_D]			/* *(uint8_t *)tmp_d = data1; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if (SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* } */
5:						/* else { */
	ldr	DATA0, [SRC]			/* data0 = *src; */
	lsr	DATA1, DATA0, DST_SRC_ALIGNBIT	/* data1=data0<<dst_src_abit;*/
9:						/* } */

	cbz	DST_ALIGNBIT, 9f		/* if (dst_alignbit != 0) { */
	mov	TMP_D, DST			/* tmp_d = dst; */

	tbz	DST_ALIGNBIT, #(2+3), 1f	/* if (dst_alignbit & (4<<3)) { */
	lsr	TMP_X, DATA1, #32		/* x = data1 >> 32; */
	str	TMP_Xw, [TMP_D], #4		/* *(uint32_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(1+3), 1f	/* if (dst_alignbit & (2<<3)) { */
	lsr	TMP_X, DATA1, #16		/* x = data1 >> 16; */
	strh	TMP_Xw, [TMP_D], #2		/* *(uint16_t *)tmp_d++ = x; */
1:						/* } */
	tbz	DST_ALIGNBIT, #(0+3), 1f	/* if (dst_alignbit & (1<<3)) { */
	lsr	TMP_X, DATA1, #8		/* x = data1 >> 8; */
	strb	TMP_Xw, [TMP_D], #1		/* *(uint8_t *)tmp_d++ = x; */
1:						/* } */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -= (dst_alignbit>>3); */
9:						/* } */
#endif /* BYTE_ORDER */


backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!		/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */

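/*
 * Entry point.  The top-level dispatch is roughly the following C
 * (illustrative only; the overlap checks exist only when NO_OVERLAP is
 * not defined, i.e. for memmove):
 *
 *	if (len == 0 || src == dst)
 *		return;
 *	if (may_overlap && src < dst)
 *		copy_backward();	// copy from the end, downwards
 *	else if (len < SMALLSIZE)
 *		copy_small();		// branch-free bit tests on len
 *	else
 *		copy_forward();		// align dst, then bulk ldp/stp
 *
 * With STRICT_ALIGNMENT defined, the shifting copies above stand in for
 * plain unaligned loads and stores.
 */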
	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-10 bytes */
	adr	TMP_X, 8f
	sub	TMP_X, TMP_X, LEN, lsl #3	/* jump to (8f - len*2) */
	br	TMP_X
	.rept	10
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	.endr
8:
	ret
9:
	/* length is small (<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN		/* tmp_s = src + len */
1:	/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1		/* *(char *)dst++ = *(char *)src++; */
	cmp	SRC0, TMP_S			/* } while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsl	TMP_X, DATA0, TMP_X		/* (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if (DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 = */
	lsr	TMP_X, DATA0, TMP_X		/* (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/* (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

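/*
 * Main forward shifting loop.  DATA0 carries the last aligned word read
 * on the previous iteration; each pass loads two more aligned source
 * words and emits two destination-aligned words built with shift/or,
 * mirroring backward_shifting_copy_loop above.  Illustrative C (little
 * endian; k is the byte misalignment scaled by 8):
 *
 *	out0 = (data0 >> k) | (data1 << (64 - k));
 *	out1 = (data1 >> k) | (data2 << (64 - k));
 *	data0 = data2;		// carried into the next iteration
 */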
shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8		/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

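/*
 * Bulk forward copy.  When unaligned accesses are permitted, requests
 * under 512 bytes skip the destination pre-alignment below, since the
 * alignment overhead outweighs its benefit at that size; otherwise DST
 * is first aligned to STP_ALIGN so the stp stores land on 16-byte
 * boundaries.  The body then mirrors the backward code: unrolled
 * 1024-byte blocks, a computed branch into the unrolled block for the
 * 16-byte-multiple remainder, and bit tests on len for the last bytes.
 */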
	.align	4
copy_forward:
	/* DST may be unaligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

	cmp	LEN, #1024
	bhs	forward_copy1k
forward_less1k:
	/* copy 16*n bytes */
	and	TMP_D, LEN, #(1023-15)		/* len &= 1023; len &= ~15; */
	adr	TMP_X, 8f
	sub	LEN, LEN, TMP_D
	sub	TMP_X, TMP_X, TMP_D, lsr #1	/* jump to (8f - len/2) */
	br	TMP_X
forward_copy1k:	/* copy 16*64 bytes */
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
8:
	cbz	LEN, done
	cmp	LEN, #1024
	bhs	forward_copy1k
	cmp	LEN, #16
	bhs	forward_less1k

	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)