/* $OpenBSD: _memcpy.S,v 1.2 2004/02/01 05:40:52 drahn Exp $ */
/* $NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $ */

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

/*
 * _memcpy
 *
 * In:	r0 = destination, r1 = source, r2 = length in bytes.
 *
 * Direction: copies forwards (ascending addresses) when r1 >= r0 and
 * backwards, starting from the end of both buffers, when r1 < r0
 * (unsigned compare).
 *
 * Out:	forward path  - the r0 pushed on entry is popped again on return.
 *	r1 == r0      - returns immediately with r0 = 0.
 *	backward path - returns through lr without reloading r0.
 *
 * Scratch: r1, r2, r3, r12 and the flags.  r4, r5 and lr are pushed before
 * they are used and popped afterwards.
 *
 * Throughout the word-copy code r2 is kept biased: it holds "bytes left
 * minus N" so that a plain sign test after subs/adds answers "are there at
 * least N bytes left?".
 *
 * NOTE(review): the lsr/lsl pairings in the unaligned-source loops appear
 * to assume little-endian word layout - confirm before using this file in
 * a big-endian build.
 */
ENTRY(_memcpy)
	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemcpy_backwards	/* src below dest: copy from the end */

	/*
	 * Flags are still those of the cmp above, so EQ here means
	 * src == dest (it does not test the length): nothing to move.
	 */
	moveq	r0, #0			/* Quick abort for src == dest */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4		/* r2 = len - 4 */
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */

.Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = left - 12 */
	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* r2 = left - 32 */
	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_floop32

	cmn	r2, #0x10		/* GE when at least 16 bytes remain */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = left - 12; flags drive the loop */

	/* blat 12 bytes at a time */
.Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_floop12

.Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = left - 4 */
	blt	.Lmemcpy_fl4

	subs	r2, r2, #4		/* LT: one word left, GE: two words */
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = bytes left */
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* always 1 byte, GE: 2nd, GT: 3rd */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes needed to word-align r0 */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * r12 = source offset within its word (1..3).  r1 is rounded down to
	 * a word boundary and lr is primed with the first word; each output
	 * word is then built from the tail of lr and the head of the next
	 * source word.
	 */
.Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_fsrcul3
	beq	.Lmemcpy_fsrcul2
	cmp	r2, #0x0c		/* r2 = left - 4: fewer than 16 left? */
	blt	.Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c		/* r2 = left - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = left - 4 */
	blt	.Lmemcpy_fsrcul1l4

.Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul1loop4

.Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* back to the first unconsumed byte */
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul2l4

.Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul2loop4

.Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* back to the first unconsumed byte */
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul3l4

.Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul3loop4

.Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* back to the first unconsumed byte */
	b	.Lmemcpy_fl4

	/*
	 * Backward copy: both pointers are moved to one past the end and
	 * every access pre-decrements.  lr is not saved on entry here, so
	 * each path that borrows it pushes and pops it itself.
	 */
.Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4		/* r2 = len - 4 */
	blt	.Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */

.Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = left - 12 */
	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}		/* popped again at the end of bl32 */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_bloop32

.Lmemcpy_bl32:
	cmn	r2, #0x10		/* GE when at least 16 bytes remain */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = left - 12 */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = left - 4 */
	blt	.Lmemcpy_bl4
	subs	r2, r2, #4		/* LT: one word left, GE: two words */
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = bytes left */
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* always 1 byte, GE: 2nd, GT: 3rd */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
.Lmemcpy_bdestul:
	cmp	r12, #2			/* r12 = r0 & 3 = bytes down to alignment */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * r12 = source offset within its word (1..3).  r1 is rounded down to
	 * a word boundary and r3 is primed with the word found there; each
	 * output word is then built from r3 and the next lower source word.
	 */
.Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemcpy_bsrcul1
	beq	.Lmemcpy_bsrcul2
	cmp	r2, #0x0c		/* r2 = left - 4: fewer than 16 left? */
	blt	.Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c		/* r2 = left - 16 */
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* back to r2 = left - 4 */
	blt	.Lmemcpy_bsrcul3l4

.Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul3loop4

.Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* restore the byte-granular pointer */
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul2l4

.Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul2loop4

.Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* restore the byte-granular pointer */
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* Despite the label name this loop moves 16 bytes per pass. */
.Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul1l4

.Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul1loop4

.Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* restore the byte-granular pointer */
	b	.Lmemcpy_bl4