/* $OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $ */
/* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Interface (both entry points share one body):
 *	In:	r0 = dest, r1 = src, r2 = len (bytes)
 *	Out:	r0 = dest (the value r0 had on entry)
 *	Stack:	{r0, lr} is pushed on entry; r4 / {r4, r5} are pushed only
 *		around the loops that borrow them.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len through out.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Length bookkeeping: r2 is kept *biased* (len - 4, len - 12, len - 32 ...)
 * so that a single subs/adds both updates the count and sets the flags
 * for the following conditional instructions.  The comparisons on r2 are
 * signed (blt/bge).
 *
 * Byte order: the unaligned-source loops merge neighbouring words with
 * lsr/lsl pairs whose directions match a little-endian word layout; there
 * is no big-endian variant in this file.
 *
 * Apologies for the state of the comments ;-)
 */

ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Determine copy direction */
	cmp	r1, r0

	/*
	 * Quick exit when src == dst: there is nothing to move.
	 * r0 still holds dest, which is the required return value, so it
	 * must not be modified here.
	 */
#ifdef __APCS_26__
	moveqs	pc, lr
#else
	moveq	pc, lr
#endif

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	/* flags are still those of "cmp r1, r0": src < dst (unsigned) */
	bcc	Lmemcpy_backwards

	/* start of forwards copy */
	subs	r2, r2, #4		/* r2 = len - 4 */
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* r2 = remaining - 32 */
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_fl4

	/* 4..11 bytes left: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = remaining */
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1: always, 2: ge, 3: gt */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes needed to align dest */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * On entry r12 = src & 3 (1, 2 or 3).  The source pointer is rounded
	 * down to a word boundary and lr always carries the most recently
	 * loaded source word, part of which is still to be stored.
	 */
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* back to the first unstored byte */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* back to the first unstored byte */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* back to the first unstored byte */
	b	Lmemcpy_fl4

	/*
	 * Backwards copy: both pointers are moved to one past the end of
	 * their buffers and every access below pre-decrements.
	 */
Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4		/* r2 = len - 4 */
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}		/* borrow r4 (returned in bl32) */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_bl4
	/* 4..11 bytes left: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = remaining */
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1: always, 2: ge, 3: gt */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
	/* r12 = dest & 3 = bytes to move before dest is word aligned */
Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * On entry r12 = src & 3 (1, 2 or 3).  The source pointer is rounded
	 * down to a word boundary and r3 always carries the most recently
	 * loaded source word, part of which is still to be stored.
	 */
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* forward to just past the last unstored byte */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* forward to just past the last unstored byte */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* NB: despite the label name this loop moves 16 bytes per pass */
Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* forward to just past the last unstored byte */
	b	Lmemcpy_bl4