/* $NetBSD: memset.S,v 1.3 2020/04/11 05:12:52 ryo Exp $ */

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
30beb9c6d1Smatt */ 31beb9c6d1Smatt 32beb9c6d1Smatt#include <machine/asm.h> 33beb9c6d1Smatt 34beb9c6d1SmattENTRY(memset) 35beb9c6d1Smatt cbz x2, .Lret 36beb9c6d1Smatt mov x15, x0 /* working data pointer */ 37beb9c6d1Smatt cbz x1, .Lzerofill 38beb9c6d1Smatt cbz x1, .Lfilled 39beb9c6d1Smatt /* 40beb9c6d1Smatt * Non zero fill, replicate to all 64 bits of x1. 41beb9c6d1Smatt */ 42beb9c6d1Smatt and x1, x1, #0xff 43beb9c6d1Smatt orr x1, x1, x1, lsl #8 44beb9c6d1Smatt orr x1, x1, x1, lsl #16 45beb9c6d1Smatt orr x1, x1, x1, lsl #32 46beb9c6d1Smatt.Lfilled: 47beb9c6d1Smatt cmp x2, #15 /* if it's small, ignore alignment */ 48beb9c6d1Smatt b.ls .Llast_subqword 49beb9c6d1Smatt 50beb9c6d1Smatt mov x6, x1 51beb9c6d1Smatt tst x15, #15 52beb9c6d1Smatt b.eq .Lqword_loop 53beb9c6d1Smatt 54beb9c6d1Smatt/* 55beb9c6d1Smatt * We have at least 15 to copy which means we can get qword alignment 56beb9c6d1Smatt * without having to check the amount left. 57beb9c6d1Smatt */ 58beb9c6d1Smatt tbz x15, #0, .Lhword_aligned 59beb9c6d1Smatt strb w1, [x15], #1 60beb9c6d1Smatt.Lhword_aligned: 61beb9c6d1Smatt tbz x15, #1, .Lword_aligned 62beb9c6d1Smatt strh w1, [x15], #2 63beb9c6d1Smatt.Lword_aligned: 64beb9c6d1Smatt tbz x15, #2, .Ldword_aligned 65beb9c6d1Smatt str w1, [x15], #4 66beb9c6d1Smatt.Ldword_aligned: 67beb9c6d1Smatt tbz x15, #3, .Lqword_aligned 68beb9c6d1Smatt str x1, [x15], #8 69beb9c6d1Smatt/* 70beb9c6d1Smatt * Now we qword aligned. Figure how much we have to write to get here. 71beb9c6d1Smatt * Then subtract from the length. If we get 0, we're done. 72beb9c6d1Smatt */ 73beb9c6d1Smatt.Lqword_aligned: 74beb9c6d1Smatt sub x5, x15, x0 75beb9c6d1Smatt subs x2, x2, x5 76beb9c6d1Smatt b.eq .Lret 77beb9c6d1Smatt 78beb9c6d1Smatt/* 79beb9c6d1Smatt * Write 16 bytes at time. If we don't have 16 bytes to write, bail. 80beb9c6d1Smatt * Keep looping if there's data to set. 
81beb9c6d1Smatt */ 82beb9c6d1Smatt.Lqword_loop: 83beb9c6d1Smatt subs x2, x2, #16 84beb9c6d1Smatt b.mi .Llast_subqword 85beb9c6d1Smatt stp x1, x6, [x15], #16 86beb9c6d1Smatt b.ne .Lqword_loop 87beb9c6d1Smatt ret 88beb9c6d1Smatt 89beb9c6d1Smatt/* 90beb9c6d1Smatt * We have less than a qword to write. We hope we are aligned but since 91beb9c6d1Smatt * unaligned access works, we don't have to be aligned. 92beb9c6d1Smatt */ 93beb9c6d1Smatt.Llast_subqword: 94beb9c6d1Smatt tbz x2, #3, .Llast_subdword 95beb9c6d1Smatt str x1, [x15], #8 96beb9c6d1Smatt.Llast_subdword: 97beb9c6d1Smatt tbz x2, #2, .Llast_subword 98beb9c6d1Smatt str w1, [x15], #4 99beb9c6d1Smatt.Llast_subword: 100beb9c6d1Smatt tbz x2, #1, .Llast_subhword 101beb9c6d1Smatt strh w1, [x15], #2 102beb9c6d1Smatt.Llast_subhword: 103beb9c6d1Smatt tbz x2, #0, .Lret 104beb9c6d1Smatt strb w1, [x15] 105beb9c6d1Smatt.Lret: ret 106beb9c6d1Smatt 107beb9c6d1Smatt/* 108beb9c6d1Smatt * If we are filling with zeros then let's see if we can use the 109beb9c6d1Smatt * dc zva, <Xt> 110beb9c6d1Smatt * instruction to speed things up. 111beb9c6d1Smatt */ 112beb9c6d1Smatt.Lzerofill: 113beb9c6d1Smatt mrs x9, dczid_el0 114beb9c6d1Smatt /* 115beb9c6d1Smatt * Make sure we can the instruction isn't prohibited. 116beb9c6d1Smatt */ 117beb9c6d1Smatt tbnz x9, #4, .Lfilled 118beb9c6d1Smatt /* 119beb9c6d1Smatt * Now find out the block size. 120beb9c6d1Smatt */ 121beb9c6d1Smatt ubfx x9, x9, #0, #4 /* extract low 4 bits */ 122beb9c6d1Smatt add x9, x9, #2 /* add log2(word) */ 123beb9c6d1Smatt mov x10, #1 /* the value is log2(words) */ 124beb9c6d1Smatt lsl x10, x10, x9 /* shift to get the block size */ 125beb9c6d1Smatt cmp x2, x10 /* are we even copying a block? 
*/ 126beb9c6d1Smatt b.lt .Lfilled /* no, do it 16 bytes at a time */ 127beb9c6d1Smatt /* 128beb9c6d1Smatt * Now we figure out how many aligned blocks we have 129beb9c6d1Smatt */ 130beb9c6d1Smatt sub x11, x10, #1 /* make block size a mask */ 131beb9c6d1Smatt add x12, x15, x11 /* round start to a block boundary */ 132beb9c6d1Smatt asr x12, x12, x9 /* "starting" block number */ 133beb9c6d1Smatt add x13, x15, x2 /* get ending address */ 134beb9c6d1Smatt asr x13, x13, x9 /* "ending" block numebr */ 135beb9c6d1Smatt cmp x13, x12 /* how many blocks? */ 136782b3eacSryo b.ls .Lfilled /* none, do it 16 bytes at a time */ 137beb9c6d1Smatt 138beb9c6d1Smatt /* 139beb9c6d1Smatt * Now we have one or more blocks to deal with. First now we need 140beb9c6d1Smatt * to get block aligned. 141beb9c6d1Smatt */ 142beb9c6d1Smatt and x7, x15, x11 /* are already aligned on a block boundary? */ 143beb9c6d1Smatt cbz x7, .Lblock_aligned 144beb9c6d1Smatt 145beb9c6d1Smatt sub x7, x10, x7 /* subtract offset from block length */ 146beb9c6d1Smatt sub x2, x2, x7 /* subtract that from length */ 147782b3eacSryo asr x7, x7, #4 /* length -> N*16 */ 148beb9c6d1Smatt 149beb9c6d1Smatt tbz x15, #0, .Lzero_hword_aligned 150beb9c6d1Smatt strb wzr, [x15], #1 151beb9c6d1Smatt.Lzero_hword_aligned: 152beb9c6d1Smatt tbz x15, #1, .Lzero_word_aligned 153beb9c6d1Smatt strh wzr, [x15], #2 154beb9c6d1Smatt.Lzero_word_aligned: 155beb9c6d1Smatt tbz x15, #2, .Lzero_dword_aligned 156beb9c6d1Smatt str wzr, [x15], #4 157beb9c6d1Smatt.Lzero_dword_aligned: 158beb9c6d1Smatt tbz x15, #3, .Lzero_qword_aligned 159beb9c6d1Smatt str xzr, [x15], #8 160beb9c6d1Smatt.Lzero_qword_aligned: 161*adc5085fSryo cbz x7, .Lblock_aligned /* aligned? just branch */ 162782b3eacSryo 163*adc5085fSryo /* align to DCZID_EL0:BS boundary */ 164*adc5085fSryo tbz x7, #0, 0f /* fill 16byte? */ 165beb9c6d1Smatt stp xzr, xzr, [x15], #16 166*adc5085fSryo0: 167*adc5085fSryo tbz x7, #1, 1f /* fill 32byte? 
*/ 168*adc5085fSryo stp xzr, xzr, [x15], #16 169*adc5085fSryo stp xzr, xzr, [x15], #16 170*adc5085fSryo1: 171*adc5085fSryo lsr x7, x7, #2 172*adc5085fSryo cbz x7, 9f 173*adc5085fSryo.L64bytes_fill: 174*adc5085fSryo sub x7, x7, #1 175*adc5085fSryo stp xzr, xzr, [x15], #16 176*adc5085fSryo stp xzr, xzr, [x15], #16 177*adc5085fSryo stp xzr, xzr, [x15], #16 178*adc5085fSryo stp xzr, xzr, [x15], #16 179*adc5085fSryo cbnz x7, .L64bytes_fill 180*adc5085fSryo9: 181beb9c6d1Smatt 182beb9c6d1Smatt/* 183beb9c6d1Smatt * Now we are block aligned. 184beb9c6d1Smatt */ 185beb9c6d1Smatt.Lblock_aligned: 186beb9c6d1Smatt subs x2, x2, x10 187beb9c6d1Smatt b.mi .Lblock_done 188beb9c6d1Smatt dc zva, x15 189beb9c6d1Smatt add x15, x15, x10 190beb9c6d1Smatt b.ne .Lblock_aligned 191beb9c6d1Smatt ret 192beb9c6d1Smatt 193beb9c6d1Smatt.Lblock_done: 194782b3eacSryo and x2, x2, x11 /* make positive again */ 195beb9c6d1Smatt mov x6, xzr /* fill 2nd xword */ 196beb9c6d1Smatt b .Lqword_loop /* and finish filling */ 197beb9c6d1Smatt 198beb9c6d1SmattEND(memset) 199