/* $OpenBSD: memcpy.S,v 1.3 2008/06/26 05:42:20 ray Exp $ */
/* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy(),
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
 * Additional registers are preserved prior to use i.e.
r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * Both names enter the same overlap-tolerant routine.
 *
 * In:	r0 = dst, r1 = src, r2 = len
 * Out:	r0 = dst, exactly as passed in
 * Regs:	r3 and r12 are scratch.  lr is pushed together with r0 on entry
 *	and then reused as a data register; r4 and r5 are pushed around
 *	the loops that need them and popped again afterwards.
 *
 * Direction: src > dst (unsigned) copies forwards from the start of the
 * buffers; src < dst copies backwards from their ends, so overlapping
 * regions are handled either way.  src == dst returns straight away.
 *
 * r2 is kept biased while copying (4, 12 or 32 is subtracted up front)
 * so that a plain sign test tells whether another chunk of that size
 * still fits.  The tests are signed.
 *
 * The unaligned-source paths read whole aligned words and splice
 * neighbouring words together with lsr/lsl pairs; the shift directions
 * used here are the little-endian ones.
 */
ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Determine copy direction */
	cmp	r1, r0

	/*
	 * src == dst: there is nothing to move.  r0 still holds dst,
	 * which is the value both functions must return, so it is left
	 * untouched.  The flags from the cmp above survive until the
	 * bcc below because neither the conditional mov nor the stmdb
	 * alters them.
	 */
#ifdef __APCS_26__
	moveqs	pc, lr
#else
	moveq	pc, lr
#endif

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	bcc	Lmemcpy_backwards	/* src < dst: copy from the top down */

	/* start of forwards copy */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10		/* at least 16 bytes still to go? */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* back to the "minus 12" bias */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* back to the "minus 4" bias */
	blt	Lmemcpy_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* 4..7 left: move one word */
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}		/* 8..11 left: move two words */
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* drop the bias: r2 = bytes left */
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1: always, 2: ge, 3: gt */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* bytes needed to word-align dst */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * r12 = src & 3 (1, 2 or 3).  The source is only ever read as
	 * aligned words; lr carries the word whose upper bytes have not
	 * been stored yet.
	 */
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source offset 1: 16 bytes per pass */
Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

	/* source offset 1: one word per pass */
Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* back to the first uncopied byte */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source offset 2: 16 bytes per pass */
Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

	/* source offset 2: one word per pass */
Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* back to the first uncopied byte */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source offset 3: 16 bytes per pass */
Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

	/* source offset 3: one word per pass */
Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* back to the first uncopied byte */
	b	Lmemcpy_fl4

Lmemcpy_backwards:
	add	r1, r1, r2		/* r1 = one past the end of src */
	add	r0, r0, r2		/* r0 = one past the end of dst */
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}		/* borrow r4; popped again in bl32 */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10		/* at least 16 bytes still to go? */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* back to the "minus 12" bias */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_bl12:
	adds	r2, r2, #8		/* back to the "minus 4" bias */
	blt	Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!		/* 4..7 left: move one word */
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}		/* 8..11 left: move two words */
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* drop the bias: r2 = bytes left */
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1: always, 2: ge, 3: gt */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2			/* r12 = bytes above the word boundary */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * r12 = src end & 3 (1, 2 or 3).  The source is only ever read
	 * as aligned words; r3 carries the word whose lower bytes have
	 * not been stored yet.
	 */
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source offset 3: 16 bytes per pass */
Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

	/* source offset 3: one word per pass */
Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* back to just past the last uncopied byte */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source offset 2: 16 bytes per pass */
Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

	/* source offset 2: one word per pass */
Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* back to just past the last uncopied byte */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/*
	 * source offset 1: 16 bytes per pass (the "32" in the label name
	 * notwithstanding -- four words in, four words out, r2 -= 0x10)
	 */
Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

	/* source offset 1: one word per pass */
Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* back to just past the last uncopied byte */
	b	Lmemcpy_bl4
