/*-
 * Copyright (c) 2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: memcpy_neon.S,v 1.1 2013/01/03 09:34:44 matt Exp $")

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * In:		r0 = dst, r1 = src, r2 = len
 * Out:		r0 = dst (r0 is never written; r3 is the working dst pointer)
 * Scratch:	r1-r3, ip, flags, d0-d7, d16-d23
 * Saved:	r4-r5 (pushed once dst is aligned, popped at .Ldone)
 *
 * Only d0-d7 and d16-d23 are used as NEON scratch: the ARM procedure call
 * standard requires d8-d15 to be preserved across calls, so they are left
 * untouched rather than saved and restored.
 *
 * Outline:
 *  1. Copy byte by byte until dst is 8-byte (dword) aligned.
 *  2. If src is then dword aligned too ("congruent"), copy dwords until
 *     dst is 64-byte aligned, then move 128/64 bytes at a time with
 *     vldm/vstm, and finish dword by dword.
 *  3. Otherwise ("incongruent") src is rounded down to a dword boundary
 *     and every output dword is built with vtbl from two neighbouring
 *     source dwords.  d0 holds the byte-index table (.Ltbl_value plus the
 *     src misalignment), d1 carries the most recently loaded source dword
 *     and r5 is the number of not-yet-stored source bytes held in d1.
 *     A new source dword is loaded only when r2 > r5, so no dword lying
 *     entirely beyond the last source byte is ever read.
 *  4. .Lfinish stores the trailing 1-7 bytes held in d0.
 *
 * All length tests use unsigned conditions (lo/hs/hi/ls).
 */
	.text
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/* if not, does src == dst? */
	RETc(eq)			/* yes (to either), return */

	mov	r3, r0			/* keep r0 unchanged */

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword aligned? */
	beq	.Lsrc_aligner		/* yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy from src to dst byte
	 * by byte until it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/* no, try next byte */
	RET				/* yes, we're done! */

.Lsrc_aligner:
	push	{r4-r5}			/* save some registers */
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/* aligned, do it the fast way */

	vdup.8	d1, r5			/* set offset for table */
	rsb	r5, r5, #8		/* src bytes carried over in d1 */
	bic	r1, r1, #7		/* dword align src pointer */

	vldr	d0, .Ltbl_value		/* load table value */
	vadd.u8	d0, d0, d1		/* add offset to it */

	vld1.64	{d1}, [r1:64]!		/* load a dword from src */

.Lincongruent_check:
	cmp	r2, r5			/* do we already have enough? */
	bhi	.Lincongruent		/* no, so read more */

	/*
	 * Fewer than 8 bytes remain and they are all in d1 (and d2, when
	 * the caller loaded it).  Table indices for the wanted bytes never
	 * reach into d2 unless d2 was loaded.
	 */
.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/* no, write final partial dword */
	vst1.32	{d0}, [r3:64]		/* yes, write final full dword */
	b	.Ldone			/* and we're done! */

	/*
	 * Dword-at-a-time incongruent copy, used until dst reaches a
	 * 64-byte boundary.  Entered only when r2 > r5, i.e. when the next
	 * source dword holds at least one byte that must be copied.
	 */
.Lincongruent:
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/* no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/* yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	tst	r3, #63			/* are we 64-byte aligned? */
	bne	.Lincongruent_check	/* no, load more only if needed */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/* no, handle dword by dword */
	vld1.64	{d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 more dwords? */
	blo	.Lincongruent_4dword	/* no, handle it */

	/*
	 * 64 bytes per iteration.  On entry d1 holds the carry-over and
	 * d2-d5 the next 4 untranslated source dwords.
	 */
1:	vld1.64	{d17-d20}, [r1:64]!	/* read 4 dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d16, d5			/* move out of the way of the load */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/* no more data, skip the load */
	vld1.64	{d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d16, {d16-d17}, d0	/* reorder */
	vtbl.8	d17, {d17-d18}, d0	/* reorder */
	vtbl.8	d18, {d18-d19}, d0	/* reorder */
	vtbl.8	d19, {d19-d20}, d0	/* reorder */
	vst1.64	{d16-d19}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/* if 0, we're done! */
	vmov	d1, d20			/* last dword becomes the carry-over */
	cmp	r2, #64			/* can we write 8 more dwords? */
	bhs	1b			/* yes, do it again */

	/*
	 * We have leftovers in d1 and, if at least 32 bytes remain, new
	 * untranslated data in d2-d5.
	 */
.Lincongruent_4dword:
	cmp	r2, #32			/* were d2-d5 loaded? */
	blo	.Lincongruent_dword	/* no, go dword by dword */

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64	{d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32		/* we wrote 4 dwords */
	beq	.Ldone			/* if 0, we're done! */

	/*
	 * Fewer than 32 bytes remain.  Jump into an unrolled sequence of
	 * four load/reorder/store/carry groups so that exactly r2 / 8 of
	 * them execute.  Each group is 4 instructions (16 bytes), hence the
	 * "lsl #1" on a byte count that is a multiple of 8.  This relies on
	 * ARM (A32) encoding: fixed 4-byte instructions and pc reading as
	 * the address of the current instruction + 8, which is the first
	 * group (the nop fills the slot in between).
	 */
.Lincongruent_dword:
	cmp	r2, r5			/* are the bytes we have enough? */
	bls	.Lincongruent_finish	/* yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* bytes of groups to skip / 2 */
	ands	r2, r2, #7		/* count mod 8; Z is tested below */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64	{d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* no tail bytes (Z from the ands) */
	cmp	r2, r5			/* is the tail already in d1? */
	bls	.Lincongruent_finish	/* yes, don't read past the source */
	vld1.64	{d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */

	/*
	 * src and dst are both dword aligned.  Copy dword by dword until
	 * dst reaches a 64-byte boundary.
	 */
.Lcongruent_main:
	vld1.32	{d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/* no, write final partial dword */
	vst1.32	{d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/* equal? we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/* no, write next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/* no, do this dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blo	3f			/* no, then deal with 8 dwords */

	/*
	 * The following writes two 64-byte blocks, interleaving stores and
	 * loads.  On entry d0-d7 hold the next 8 dwords to store.
	 */
1:	vldm	r1!, {d16-d23}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blo	2f			/* no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/* yes, load next 8 dwords */
2:	vstm	r3!, {d16-d23}		/* store 8 more dwords */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/* if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bhs	1b			/* yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blo	.Lcongruent_loop	/* no, proceed to do it by dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/* if 0, we're done! */

	/* Fewer than 64 bytes (but at least one) remain. */
.Lcongruent_loop:
	vld1.32	{d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/* no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32	{d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/* if 0, we're done! */
	vld1.32	{d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bhs	.Lcongruent_loop_start	/* yes, so do it */

	/*
	 * d0 holds the final dword and r2 (1-7) says how many of its bytes
	 * belong to the copy.  Store them as a word, halfword and byte
	 * according to bits 2, 1 and 0 of r2.
	 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON */
	tst	r2, #4			/* do we have at least 4 bytes left? */
	strne	r4, [r3], #4		/* store the 1st word */
	movne	r4, r5			/* move 2nd word into place */
	tst	r2, #2			/* do we have at least 2 bytes left? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/* yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/* yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/* yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/* yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/* yes, store it */

.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

	/*
	 * Identity byte-index table for vtbl; the src misalignment is added
	 * to every lane at run time.
	 */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)