/*	$NetBSD: memcpy_arm.S,v 1.2 2008/04/28 20:22:52 martin Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code, e.g. Iron Maiden.
 *
 * For anyone attempting to understand it:
 *
 * The core code is implemented here with simple stubs for memcpy().
 *
 * All local labels are prefixed with Lmemcpy_.
 * Following the prefix, a label starting with 'f' is used in the forward
 * copy code, while a label starting with 'b' is used in the backwards
 * copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy:
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3, although r0-r2 have defined uses (src, dest, len) throughout.
 * Additional registers are preserved prior to use, i.e. r4, r5 & lr.
 *
 * Apologies for the state of the comments ;-)
 */
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
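	/*
	 * 4 to 11 bytes remain at this point.  The conditional
	 * instructions below copy one word when fewer than 8 bytes are
	 * left (lt) and two words otherwise (ge), leaving at most 3
	 * bytes to fall through to .Lmemcpy_l4.
	 */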
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
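	/*
	 * The source is not word aligned (r12 = src & 3 is 1, 2 or 3).
	 * Round the source down to a word boundary, keep the most
	 * recently fetched aligned word in lr, and build each output
	 * word by shifting it and OR-ing in bits from the next aligned
	 * word.  Separate loops handle the 1, 2 and 3 byte offsets; the
	 * shift directions are swapped for big-endian (__ARMEB__).
	 */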
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

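	/* copy the remaining words one at a time (3 byte source offset) */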
.Lmemcpy_srcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4