1*28c67577Sguenther/* $OpenBSD: strlen.S,v 1.10 2022/12/07 19:26:39 guenther Exp $ */ 28c688dc9Sreyk/* $NetBSD: strlen.S,v 1.6 2014/03/22 19:16:34 jakllsch Exp $ */ 38c688dc9Sreyk 48c688dc9Sreyk/*- 58c688dc9Sreyk * Copyright (c) 2009 The NetBSD Foundation, Inc. 68c688dc9Sreyk * All rights reserved. 78c688dc9Sreyk * 88c688dc9Sreyk * This code is derived from software contributed to The NetBSD Foundation 98c688dc9Sreyk * by David Laight. 108c688dc9Sreyk * 118c688dc9Sreyk * Redistribution and use in source and binary forms, with or without 128c688dc9Sreyk * modification, are permitted provided that the following conditions 138c688dc9Sreyk * are met: 148c688dc9Sreyk * 1. Redistributions of source code must retain the above copyright 158c688dc9Sreyk * notice, this list of conditions and the following disclaimer. 168c688dc9Sreyk * 2. Redistributions in binary form must reproduce the above copyright 178c688dc9Sreyk * notice, this list of conditions and the following disclaimer in the 188c688dc9Sreyk * documentation and/or other materials provided with the distribution. 198c688dc9Sreyk * 208c688dc9Sreyk * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 218c688dc9Sreyk * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 228c688dc9Sreyk * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 238c688dc9Sreyk * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 248c688dc9Sreyk * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 258c688dc9Sreyk * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 268c688dc9Sreyk * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 278c688dc9Sreyk * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 288c688dc9Sreyk * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 298c688dc9Sreyk * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 308c688dc9Sreyk * POSSIBILITY OF SUCH DAMAGE. 318c688dc9Sreyk */ 328c688dc9Sreyk 338c688dc9Sreyk/* 348c688dc9Sreyk * Inspired by a version written by J.T. Conklin <jtc@acorntoolworks.com> 358c688dc9Sreyk * (Only the long comment really remains his work!) 368c688dc9Sreyk */ 378c688dc9Sreyk 388c688dc9Sreyk#include <machine/asm.h> 398c688dc9Sreyk 408c688dc9Sreyk/* 418c688dc9Sreyk * There are many well known branch-free sequences which are used 428c688dc9Sreyk * for determining whether a zero-byte is contained within a word. 43d771b0cbSjsg * These sequences are generally much more efficient than loading 448c688dc9Sreyk * and comparing each byte individually. 458c688dc9Sreyk * 468c688dc9Sreyk * The expression [1,2]: 478c688dc9Sreyk * 488c688dc9Sreyk * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f)) 498c688dc9Sreyk * 508c688dc9Sreyk * evaluates to a non-zero value if any of the bytes in the 518c688dc9Sreyk * original word is zero. 528c688dc9Sreyk * 538c688dc9Sreyk * It also has the useful property that bytes in the result word 548c688dc9Sreyk * that correspond to non-zero bytes in the original word have 558c688dc9Sreyk * the value 0x00, while bytes corresponding to zero bytes have 568c688dc9Sreyk * the value 0x80. This allows calculation of the first (and 578c688dc9Sreyk * last) occurrence of a zero byte within the word (useful for C's 588c688dc9Sreyk * str* primitives) by counting the number of leading (or 598c688dc9Sreyk * trailing) zeros and dividing the result by 8. On machines 608c688dc9Sreyk * without (or with slow) clz() / ctz() instructions, testing 618c688dc9Sreyk * each byte in the result word for zero is necessary. 628c688dc9Sreyk * 638c688dc9Sreyk * This typically takes 4 instructions (5 on machines without 648c688dc9Sreyk * "not-or") not including those needed to load the constant. 658c688dc9Sreyk * 668c688dc9Sreyk * 678c688dc9Sreyk * The expression: 688c688dc9Sreyk * 698c688dc9Sreyk * (2) ((x - 0x01....01) & 0x80....80 & ~x) 708c688dc9Sreyk * 718c688dc9Sreyk * evaluates to a non-zero value if any of the bytes in the 728c688dc9Sreyk * original word is zero. 738c688dc9Sreyk * 748c688dc9Sreyk * On little endian machines, the first byte in the result word 758c688dc9Sreyk * that corresponds to a zero byte in the original byte is 0x80, 768c688dc9Sreyk * so clz() can be used as above. On big endian machines, and 778c688dc9Sreyk * little endian machines without (or with a slow) clz() insn, 788c688dc9Sreyk * testing each byte in the original for zero is necessary. 798c688dc9Sreyk * 808c688dc9Sreyk * This typically takes 3 instructions (4 on machines without 818c688dc9Sreyk * "and with complement") not including those needed to load 828c688dc9Sreyk * constants. 838c688dc9Sreyk * 848c688dc9Sreyk * 858c688dc9Sreyk * The expression: 868c688dc9Sreyk * 878c688dc9Sreyk * (3) ((x - 0x01....01) & 0x80....80) 888c688dc9Sreyk * 898c688dc9Sreyk * always evaluates to a non-zero value if any of the bytes in 908c688dc9Sreyk * the original word is zero or has the top bit set. 918c688dc9Sreyk * For strings that are likely to only contain 7-bit ascii these 928c688dc9Sreyk * false positives will be rare. 938c688dc9Sreyk * 948c688dc9Sreyk * To account for possible false positives, each byte of the 958c688dc9Sreyk * original word must be checked when the expression evaluates to 968c688dc9Sreyk * a non-zero value. However, because it is simpler than those 978c688dc9Sreyk * presented above, code that uses it will be faster as long as 988c688dc9Sreyk * the rate of false positives is low. 998c688dc9Sreyk * 100c05e1f5dSkrw * This is likely, because the false positive can only occur 101d771b0cbSjsg * if the most significant bit of a byte within the word is set. 1028c688dc9Sreyk * The expression will never fail for typical 7-bit ASCII strings. 1038c688dc9Sreyk * 1048c688dc9Sreyk * This typically takes 2 instructions not including those needed 1058c688dc9Sreyk * to load constants. 1068c688dc9Sreyk * 1078c688dc9Sreyk * 1088c688dc9Sreyk * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 1098c688dc9Sreyk * 1108c688dc9Sreyk * [2] International Business Machines, "The PowerPC Compiler Writer's 1118c688dc9Sreyk * Guide", Warthman Associates, 1996 1128c688dc9Sreyk */ 1138c688dc9Sreyk 1148c688dc9SreykENTRY(strlen) 1151d66f0a0Smortimer RETGUARD_SETUP(strlen, r10) 1168c688dc9Sreyk movabsq $0x0101010101010101,%r8 1178c688dc9Sreyk 1188c688dc9Sreyk test $7,%dil 1198c688dc9Sreyk movq %rdi,%rax /* Buffer, %rdi unchanged */ 1208c688dc9Sreyk movabsq $0x8080808080808080,%r9 1218c688dc9Sreyk jnz 10f /* Jump if misaligned */ 1228c688dc9Sreyk 1238c688dc9Sreyk _ALIGN_TEXT 1248c688dc9Sreyk1: 1258c688dc9Sreyk movq (%rax),%rdx /* get bytes to check */ 1268c688dc9Sreyk2: 1278c688dc9Sreyk addq $8,%rax 1288c688dc9Sreyk mov %rdx,%rcx /* save for later check */ 1298c688dc9Sreyk subq %r8,%rdx /* alg (3) above first */ 1308c688dc9Sreyk not %rcx /* Invert of data */ 1318c688dc9Sreyk andq %r9,%rdx 1328c688dc9Sreyk je 1b /* jump if all 0x01-0x80 */ 1338c688dc9Sreyk 1348c688dc9Sreyk /* Do check from alg (2) above - loops for 0x81..0xff bytes */ 1358c688dc9Sreyk andq %rcx,%rdx 1368c688dc9Sreyk je 1b 1378c688dc9Sreyk 1388c688dc9Sreyk /* Since we are LE, use bit scan for first 0x80 byte */ 1398c688dc9Sreyk sub %rdi,%rax /* length to next word */ 1408c688dc9Sreyk bsf %rdx,%rdx /* 7, 15, 23 ... 63 */ 1418c688dc9Sreyk shr $3,%rdx /* 0, 1, 2 ... 7 */ 1428c688dc9Sreyk lea -8(%rax,%rdx),%rax 1431d66f0a0Smortimer RETGUARD_CHECK(strlen, r10) 1448c688dc9Sreyk ret 145fc541c5dSguenther lfence 1468c688dc9Sreyk 1478c688dc9Sreyk/* Misaligned, read aligned word and make low bytes non-zero */ 148b8dcfd0bSguenther _ALIGN_TRAPS 1498c688dc9Sreyk10: 1508c688dc9Sreyk mov %al,%cl 1518c688dc9Sreyk mov $1,%rsi 1528c688dc9Sreyk and $7,%cl /* offset into word 1..7 */ 1538c688dc9Sreyk and $~7,%al /* start of word with buffer */ 1548c688dc9Sreyk shl $3,%cl /* bit count 8, 16 .. 56 */ 1558c688dc9Sreyk movq (%rax),%rdx /* first data in high bytes */ 1568c688dc9Sreyk shl %cl,%rsi 1578c688dc9Sreyk dec %rsi 1588c688dc9Sreyk or %rsi,%rdx /* low bytes now non-zero */ 1598c688dc9Sreyk jmp 2b 160*28c67577SguentherEND(strlen) 161