/* $OpenBSD: memcpy.S,v 1.5 2014/12/30 08:12:52 jsg Exp $ */
/* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * memcpy() and memmove() share one entry address and one body; the copy
 * direction is picked from the pointer order, so overlapping buffers are
 * handled for both names.  (Older revisions of this comment also mention a
 * bcopy() stub; no bcopy entry is defined in the code below.)
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Register usage:
 *	r0		destination pointer (its entry value is the result)
 *	r1		source pointer
 *	r2		byte count, kept biased below the true remaining
 *			count so the sign of a subs/adds result tells
 *			whether another chunk of that size still fits
 *	r3, r12 (ip)	scratch, trashed
 *	lr		saved on entry, then used as scratch
 *	r4, r5		borrowed by the bulk loops; pushed before use and
 *			popped afterwards
 *
 * The shift/merge sequences in the unaligned-source loops take the low
 * bytes of a word as the lower addresses, i.e. they assume little-endian
 * byte order.
 *
 * Apologies for the state of the comments ;-)
 */

ENTRY(memcpy)
ENTRY_NP(memmove)
	/* Determine copy direction */
	cmp	r1, r0

	/* src == dst: nothing to copy, r0 already holds the result */
	moveq	pc, lr

	/*
	 * Save the result (dest) and the return address once, up front.
	 * This frees lr for use as a scratch register and lets every exit
	 * path return with a single ldmia.  stmdb leaves the flags from
	 * the cmp above intact for the bcc below.
	 */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	bcc	Lmemcpy_backwards	/* src < dst (unsigned): copy from the end */

	/* start of forwards copy */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_fl4

	/* 4..11 bytes left: copy one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = bytes left (0..3) */
/*
 * NOTE(review): the macro tested here has a single trailing underscore
 * and only this one return site has a "^" variant -- confirm whether
 * __APCS_26__ was intended.  Left exactly as found.
 */
#ifdef __APCS_26_
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1 byte always, 2nd if >= 2, 3rd if > 2 */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes needed to word-align dest */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * Destination is word aligned, source is not; r12 = source & 3.
	 * Round the source down to a word boundary and keep the most
	 * recently loaded source word in lr.  Each output word is then
	 * built from the high bytes of lr and the low bytes of the next
	 * source word.  One variant per source offset (1, 2, 3).
	 */
Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	/* source offset 1 */
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4	/* fewer than 16 bytes: word loop only */
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* 16 bytes per iteration */
Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

	/* 4 bytes per iteration */
Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* back to the true (unaligned) source position */
	b	Lmemcpy_fl4

	/* source offset 2 */
Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* 16 bytes per iteration */
Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

	/* 4 bytes per iteration */
Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* back to the true (unaligned) source position */
	b	Lmemcpy_fl4

	/* source offset 3 */
Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* 16 bytes per iteration */
Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

	/* 4 bytes per iteration */
Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* back to the true (unaligned) source position */
	b	Lmemcpy_fl4

	/*
	 * Backwards copy: point r0/r1 one past the end of each buffer and
	 * work downwards with pre-decrement addressing.
	 */
Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}		/* borrow r4; popped at the end of bl32 */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10		/* at least 16 bytes left? */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}		/* return r4 */

Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	Lmemcpy_bl4
	/* 4..11 bytes left: copy one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = bytes left (0..3) */
	ldmeqia	sp!, {r0, pc}

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1 byte always, 2nd if >= 2, 3rd if > 2 */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	/* r12 = dest & 3 = bytes to copy to reach a word boundary going down */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * Destination is word aligned, source is not; r12 = source & 3.
	 * Round the source down to a word boundary and keep the most
	 * recently loaded source word in r3.  Each output word is built
	 * from the low bytes of r3 and the high bytes of the next lower
	 * source word.  One variant per source offset (3, 2, 1).
	 */
Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	/* source offset 3 */
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4	/* fewer than 16 bytes: word loop only */
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* 16 bytes per iteration */
Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

	/* 4 bytes per iteration */
Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* back to the true (unaligned) source position */
	b	Lmemcpy_bl4

	/* source offset 2 */
Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* 16 bytes per iteration */
Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

	/* 4 bytes per iteration */
Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* back to the true (unaligned) source position */
	b	Lmemcpy_bl4

	/* source offset 1 */
Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/*
	 * Despite the "loop32" in the label, this moves 16 bytes per
	 * iteration like its bsrcul2/bsrcul3 siblings (r2 drops by 0x10).
	 */
Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

	/* 4 bytes per iteration */
Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* back to the true (unaligned) source position */
	b	Lmemcpy_bl4