/*	$NetBSD: memset.S,v 1.1 2014/08/10 05:47:35 matt Exp $	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

ENTRY(memset)
	cbz	x2, .Lret
	mov	x15, x0			/* working data pointer */
	cbz	x1, .Lzerofill
	/*
	 * Non-zero fill; replicate the fill byte to all 64 bits of x1.
	 */
	and	x1, x1, #0xff
	orr	x1, x1, x1, lsl #8
	orr	x1, x1, x1, lsl #16
	orr	x1, x1, x1, lsl #32
.Lfilled:
	cmp	x2, #15			/* if it's small, ignore alignment */
	b.ls	.Llast_subqword

	mov	x6, x1
	tst	x15, #15
	b.eq	.Lqword_loop

/*
 * We have at least 16 bytes to write, which means we can get qword
 * alignment without having to check the amount left.
 */
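/*
 * The ladder below peels off 1, 2, 4, then 8 bytes, keyed off the low
 * bits of the pointer (x15), until the pointer is 16-byte aligned.  As
 * a rough C sketch (illustration only, not part of the build; p stands
 * for x15 and v for the replicated fill value in x1):
 *
 *	if ((uintptr_t)p & 1) { *(uint8_t  *)p = v; p += 1; }
 *	if ((uintptr_t)p & 2) { *(uint16_t *)p = v; p += 2; }
 *	if ((uintptr_t)p & 4) { *(uint32_t *)p = v; p += 4; }
 *	if ((uintptr_t)p & 8) { *(uint64_t *)p = v; p += 8; }
 */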
	tbz	x15, #0, .Lhword_aligned
	strb	w1, [x15], #1
.Lhword_aligned:
	tbz	x15, #1, .Lword_aligned
	strh	w1, [x15], #2
.Lword_aligned:
	tbz	x15, #2, .Ldword_aligned
	str	w1, [x15], #4
.Ldword_aligned:
	tbz	x15, #3, .Lqword_aligned
	str	x1, [x15], #8
/*
 * Now we are qword aligned.  Figure out how much we had to write to get
 * here, then subtract that from the length.  If we get 0, we're done.
 */
.Lqword_aligned:
	sub	x5, x15, x0
	subs	x2, x2, x5
	b.eq	.Lret

/*
 * Write 16 bytes at a time.  If we don't have 16 bytes to write, bail.
 * Keep looping while there's data left to set.
 */
.Lqword_loop:
	subs	x2, x2, #16
	b.mi	.Llast_subqword
	stp	x1, x6, [x15], #16
	b.ne	.Lqword_loop
	ret

/*
 * We have less than a qword to write.  We hope we are aligned, but since
 * unaligned access works, we don't have to be aligned.
 */
.Llast_subqword:
	tbz	x2, #3, .Llast_subdword
	str	x1, [x15], #8
.Llast_subdword:
	tbz	x2, #2, .Llast_subword
	str	w1, [x15], #4
.Llast_subword:
	tbz	x2, #1, .Llast_subhword
	strh	w1, [x15], #2
.Llast_subhword:
	tbz	x2, #0, .Lret
	strb	w1, [x15]
.Lret:	ret

/*
 * If we are filling with zeros then let's see if we can use the
 *	dc zva, <Xt>
 * instruction to speed things up.
 */
.Lzerofill:
	mrs	x9, dczid_el0
	/*
	 * Make sure the instruction isn't prohibited.
	 */
	tbnz	x9, #4, .Lfilled
	/*
	 * Now find out the block size.
	 */
	ubfx	x9, x9, #0, #4		/* extract low 4 bits */
	add	x9, x9, #2		/* add log2(word size) */
	mov	x10, #1			/* the field is log2(words) */
	lsl	x10, x10, x9		/* shift to get the block size */
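/*
 * For reference, DCZID_EL0 bit 4 (DZP) set means "dc zva" is prohibited,
 * and bits [3:0] hold log2 of the block size in 4-byte words.  The
 * computation above is, as a rough C sketch (illustration only;
 * read_dczid_el0() is a hypothetical accessor for the mrs above):
 *
 *	uint64_t dczid = read_dczid_el0();
 *	if (dczid & (1 << 4))
 *		goto filled;			// dc zva not usable
 *	size_t blocksize = (size_t)4 << (dczid & 0xf);
 *
 * A typical value of 0x4 gives a 64-byte zeroing block.
 */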
	cmp	x2, x10			/* are we even filling a whole block? */
	b.lt	.Lfilled		/* no, do it 16 bytes at a time */
	/*
	 * Now we figure out how many aligned blocks we have.
	 */
	sub	x11, x10, #1		/* make block size a mask */
	add	x12, x15, x11		/* round start up to a block boundary */
	asr	x12, x12, x9		/* "starting" block number */
	add	x13, x15, x2		/* get ending address */
	asr	x13, x13, x9		/* "ending" block number */
	cmp	x13, x12		/* how many blocks? */
	b.eq	.Lfilled		/* none, do it 16 bytes at a time */

	/*
	 * Now we have one or more blocks to deal with.  First we need
	 * to get block aligned.
	 */
	and	x7, x15, x11		/* are we already on a block boundary? */
	cbz	x7, .Lblock_aligned

	sub	x7, x10, x7		/* subtract offset from block length */
	sub	x2, x2, x7		/* subtract that from length */
	add	x8, x15, x7		/* remember the block boundary */

	tbz	x15, #0, .Lzero_hword_aligned
	strb	wzr, [x15], #1
.Lzero_hword_aligned:
	tbz	x15, #1, .Lzero_word_aligned
	strh	wzr, [x15], #2
.Lzero_word_aligned:
	tbz	x15, #2, .Lzero_dword_aligned
	str	wzr, [x15], #4
.Lzero_dword_aligned:
	tbz	x15, #3, .Lzero_qword_aligned
	str	xzr, [x15], #8
.Lzero_qword_aligned:
	sub	x7, x8, x15		/* bytes left to the block boundary */
	asr	x7, x7, #2		/* bytes -> code offset: each 4-byte stp zeroes 16 */
	cbz	x7, .Lblock_aligned	/* no qwords? just branch */
	adr	x6, .Lblock_aligned
	sub	x6, x6, x7		/* back up to write the last N qwords */
	br	x6			/* and do it */
	/*
	 * This is valid for block sizes <= 256 bytes.
	 */
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16

/*
 * Now we are block aligned.
 */
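/*
 * Each "dc zva" zeroes one whole block.  The loop below is, as a rough
 * C sketch (illustration only; zva() stands for the instruction):
 *
 *	while (len >= blocksize) {
 *		zva(p);			// zero blocksize bytes at p
 *		p += blocksize;
 *		len -= blocksize;
 *	}
 *	// the sub-block tail is finished 16 bytes at a time
 */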
.Lblock_aligned:
	subs	x2, x2, x10
	b.mi	.Lblock_done
	dc	zva, x15
	add	x15, x15, x10
	b.ne	.Lblock_aligned
	ret

.Lblock_done:
	and	x2, x2, x11		/* mask with blocksize-1 to make positive again */
	mov	x6, xzr			/* zero the 2nd qword stored by .Lqword_loop */
	b	.Lqword_loop		/* and finish filling */

END(memset)