1/* $OpenBSD: memcpy.S,v 1.4 2013/06/15 19:16:53 miod Exp $ */ 2/* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */ 3 4/*- 5 * Copyright (c) 1997 The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Neil A. Carson and Mark Brinicombe 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33#include <machine/asm.h> 34 35/* 36 * This is one fun bit of code ... 37 * Some easy listening music is suggested while trying to understand this 38 * code e.g. 
Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses throughout, i.e. r0 = dest,
 * r1 = src, r2 = len.
 * Additional registers are preserved prior to use i.e.
r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

/*
 * memcpy / memmove - copy r2 bytes from [r1] to [r0], overlap-safe.
 *
 * Both entry points share this one body.
 *
 * In:	r0 = destination address
 *	r1 = source address
 *	r2 = number of bytes to copy
 * Out:	r0 = the destination address as passed in (it is pushed at entry
 *	     and popped again on every return path that copies data)
 *
 * Scratch: r1-r3, r12 (ip) and lr are modified.  lr is pushed together
 * with r0 at entry and every return pops it straight into pc.  r4 and r5
 * are pushed before use and popped afterwards.
 *
 * Direction: if source < destination (unsigned) the copy runs backwards
 * from the end of both buffers, otherwise forwards.  If source equals
 * destination the routine returns immediately without touching memory.
 *
 * Throughout the aligned paths r2 is kept biased below the real remaining
 * byte count (by 4, 12 or 32 depending on the stage) so that the sign of
 * r2 directly answers "is there another chunk of that size left?".
 *
 * NOTE(review): the shift directions used in the unaligned-source loops
 * match little-endian byte order; no big-endian variant is visible in
 * this file - confirm before building for a big-endian target.
 */
ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Determine copy direction */
	cmp	r1, r0

	/* src == dest: nothing to do, r0 already holds the return value */
#ifdef __APCS_26__
	moveqs	pc, lr
#else
	moveq	pc, lr
#endif

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	/* flags are still those of "cmp r1, r0": carry clear => src < dest */
	bcc	Lmemcpy_backwards

	/* start of forwards copy */
	subs	r2, r2, #4		/* r2 = remaining - 4 */
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* r2 = remaining - 32 */
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	/* r2 + 16 >= 0 means at least 16 bytes are still left */
	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_fl4

	/* 4..11 bytes left: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	/* 1..3 bytes: first always, second if r2 >= 2, third if r2 > 2 */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	/* r12 = dest & 3 on entry; turn it into bytes needed to align dest */
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_fsrcul:
	/*
	 * r12 = src & 3 (1, 2 or 3).  Round src down to a word boundary and
	 * preload the first word into lr; each destination word is then
	 * stitched together from the unused bytes of the previous source
	 * word and the leading bytes of the next one.
	 */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	/* source is 1 byte past a word boundary */
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	/* step src back over the 3 bytes of lr that were never stored */
	sub	r1, r1, #3
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	/* source is 2 bytes past a word boundary */
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	/* step src back over the 2 bytes of lr that were never stored */
	sub	r1, r1, #2
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	/* source is 3 bytes past a word boundary */
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	/* step src back over the 1 byte of lr that was never stored */
	sub	r1, r1, #1
	b	Lmemcpy_fl4

Lmemcpy_backwards:
	/* point both registers one past the end; everything pre-decrements */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4		/* r2 = remaining - 4 */
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}		/* borrow r4; popped at end of bl32 */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	/* r2 = remaining - 32 here; r2 + 16 >= 0 means 16 more bytes fit */
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_bl4
	/* 4..11 bytes left: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time */
	/* 1..3 bytes: first always, second if r2 >= 2, third if r2 > 2 */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	/* r12 = dest & 3, which is exactly the byte count needed to align */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_bsrcul:
	/*
	 * r12 = src & 3 (1, 2 or 3).  Round src down to a word boundary and
	 * preload that word into r3; each destination word is then stitched
	 * together from the unused bytes of the previous source word and the
	 * trailing bytes of the next lower one.
	 */
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	/* source is 3 bytes past a word boundary */
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	/* step src forward over the 3 bytes of r3 that were never stored */
	add	r1, r1, #3
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	/* source is 2 bytes past a word boundary */
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	/* step src forward over the 2 bytes of r3 that were never stored */
	add	r1, r1, #2
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	/* source is 1 byte past a word boundary */
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

	/* despite the "32" in its name this loop moves 16 bytes per pass */
Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	/* step src forward over the 1 byte of r3 that was never stored */
	add	r1, r1, #1
	b	Lmemcpy_bl4
