1*d771b0cbSjsg/* $OpenBSD: strlen.S,v 1.9 2022/01/11 09:21:34 jsg Exp $ */ 2c2a49701Sreyk/* $NetBSD: strlen.S,v 1.6 2014/03/22 19:16:34 jakllsch Exp $ */ 3c2a49701Sreyk 4c2a49701Sreyk/*- 5c2a49701Sreyk * Copyright (c) 2009 The NetBSD Foundation, Inc. 6c2a49701Sreyk * All rights reserved. 7c2a49701Sreyk * 8c2a49701Sreyk * This code is derived from software contributed to The NetBSD Foundation 9c2a49701Sreyk * by David Laight. 10c2a49701Sreyk * 11c2a49701Sreyk * Redistribution and use in source and binary forms, with or without 12c2a49701Sreyk * modification, are permitted provided that the following conditions 13c2a49701Sreyk * are met: 14c2a49701Sreyk * 1. Redistributions of source code must retain the above copyright 15c2a49701Sreyk * notice, this list of conditions and the following disclaimer. 16c2a49701Sreyk * 2. Redistributions in binary form must reproduce the above copyright 17c2a49701Sreyk * notice, this list of conditions and the following disclaimer in the 18c2a49701Sreyk * documentation and/or other materials provided with the distribution. 19c2a49701Sreyk * 20c2a49701Sreyk * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21c2a49701Sreyk * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22c2a49701Sreyk * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23c2a49701Sreyk * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24c2a49701Sreyk * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25c2a49701Sreyk * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26c2a49701Sreyk * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27c2a49701Sreyk * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28c2a49701Sreyk * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29c2a49701Sreyk * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30c2a49701Sreyk * POSSIBILITY OF SUCH DAMAGE. 31c2a49701Sreyk */ 32c2a49701Sreyk 33c2a49701Sreyk/* 34c2a49701Sreyk * Inspired by a version written by J.T. Conklin <jtc@acorntoolworks.com> 35c2a49701Sreyk * (Only the long comment really remains his work!) 36c2a49701Sreyk */ 37c2a49701Sreyk 38b4551fdeSguenther#include "DEFS.h" 39c2a49701Sreyk 40c2a49701Sreyk/* 41c2a49701Sreyk * There are many well known branch-free sequences which are used 42c2a49701Sreyk * for determining whether a zero-byte is contained within a word. 43*d771b0cbSjsg * These sequences are generally much more efficient than loading 44c2a49701Sreyk * and comparing each byte individually. 45c2a49701Sreyk * 46c2a49701Sreyk * The expression [1,2]: 47c2a49701Sreyk * 48c2a49701Sreyk * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f)) 49c2a49701Sreyk * 50c2a49701Sreyk * evaluates to a non-zero value if any of the bytes in the 51c2a49701Sreyk * original word is zero. 52c2a49701Sreyk * 53c2a49701Sreyk * It also has the useful property that bytes in the result word 54c2a49701Sreyk * that correspond to non-zero bytes in the original word have 55c2a49701Sreyk * the value 0x00, while bytes corresponding to zero bytes have 56c2a49701Sreyk * the value 0x80. This allows calculation of the first (and 57c2a49701Sreyk * last) occurrence of a zero byte within the word (useful for C's 58c2a49701Sreyk * str* primitives) by counting the number of leading (or 59c2a49701Sreyk * trailing) zeros and dividing the result by 8. On machines 60c2a49701Sreyk * without (or with slow) clz() / ctz() instructions, testing 61c2a49701Sreyk * each byte in the result word for zero is necessary. 62c2a49701Sreyk * 63c2a49701Sreyk * This typically takes 4 instructions (5 on machines without 64c2a49701Sreyk * "not-or") not including those needed to load the constant. 65c2a49701Sreyk * 66c2a49701Sreyk * 67c2a49701Sreyk * The expression: 68c2a49701Sreyk * 69c2a49701Sreyk * (2) ((x - 0x01....01) & 0x80....80 & ~x) 70c2a49701Sreyk * 71c2a49701Sreyk * evaluates to a non-zero value if any of the bytes in the 72c2a49701Sreyk * original word is zero. 73c2a49701Sreyk * 74c2a49701Sreyk * On little endian machines, the first byte in the result word 75c2a49701Sreyk * that corresponds to a zero byte in the original byte is 0x80, 76c2a49701Sreyk * so clz() can be used as above. On big endian machines, and 77c2a49701Sreyk * little endian machines without (or with a slow) clz() insn, 78c2a49701Sreyk * testing each byte in the original for zero is necessary. 79c2a49701Sreyk * 80c2a49701Sreyk * This typically takes 3 instructions (4 on machines without 81c2a49701Sreyk * "and with complement") not including those needed to load 82c2a49701Sreyk * constants. 83c2a49701Sreyk * 84c2a49701Sreyk * 85c2a49701Sreyk * The expression: 86c2a49701Sreyk * 87c2a49701Sreyk * (3) ((x - 0x01....01) & 0x80....80) 88c2a49701Sreyk * 89c2a49701Sreyk * always evaluates to a non-zero value if any of the bytes in 90c2a49701Sreyk * the original word is zero or has the top bit set. 91c2a49701Sreyk * For strings that are likely to only contain 7-bit ascii these 92c2a49701Sreyk * false positives will be rare. 93c2a49701Sreyk * 94c2a49701Sreyk * To account for possible false positives, each byte of the 95c2a49701Sreyk * original word must be checked when the expression evaluates to 96c2a49701Sreyk * a non-zero value. However, because it is simpler than those 97c2a49701Sreyk * presented above, code that uses it will be faster as long as 98c2a49701Sreyk * the rate of false positives is low. 99c2a49701Sreyk * 100aa753eebSmmcc * This is likely, because the false positive can only occur 101*d771b0cbSjsg * if the most significant bit of a byte within the word is set. 102c2a49701Sreyk * The expression will never fail for typical 7-bit ASCII strings. 103c2a49701Sreyk * 104c2a49701Sreyk * This typically takes 2 instructions not including those needed 105c2a49701Sreyk * to load constants. 106c2a49701Sreyk * 107c2a49701Sreyk * 108c2a49701Sreyk * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 109c2a49701Sreyk * 110c2a49701Sreyk * [2] International Business Machines, "The PowerPC Compiler Writer's 111c2a49701Sreyk * Guide", Warthman Associates, 1996 112c2a49701Sreyk */ 113c2a49701Sreyk 114c2a49701SreykENTRY(strlen) 115631951aaSmortimer RETGUARD_SETUP(strlen, r11) 116c2a49701Sreyk movabsq $0x0101010101010101,%r8 117c2a49701Sreyk 118c2a49701Sreyk test $7,%dil 119c2a49701Sreyk movq %rdi,%rax /* Buffer, %rdi unchanged */ 120c2a49701Sreyk movabsq $0x8080808080808080,%r9 121c2a49701Sreyk jnz 10f /* Jump if misaligned */ 122c2a49701Sreyk 123c2a49701Sreyk _ALIGN_TEXT 124c2a49701Sreyk1: 125c2a49701Sreyk movq (%rax),%rdx /* get bytes to check */ 126c2a49701Sreyk2: 127c2a49701Sreyk addq $8,%rax 128c2a49701Sreyk mov %rdx,%rcx /* save for later check */ 129c2a49701Sreyk subq %r8,%rdx /* alg (3) above first */ 130c2a49701Sreyk not %rcx /* Invert of data */ 131c2a49701Sreyk andq %r9,%rdx 132c2a49701Sreyk je 1b /* jump if all 0x01-0x80 */ 133c2a49701Sreyk 134c2a49701Sreyk /* Do check from alg (2) above - loops for 0x81..0xff bytes */ 135c2a49701Sreyk andq %rcx,%rdx 136c2a49701Sreyk je 1b 137c2a49701Sreyk 138c2a49701Sreyk /* Since we are LE, use bit scan for first 0x80 byte */ 139c2a49701Sreyk sub %rdi,%rax /* length to next word */ 140c2a49701Sreyk bsf %rdx,%rdx /* 7, 15, 23 ... 63 */ 141c2a49701Sreyk shr $3,%rdx /* 0, 1, 2 ... 7 */ 142c2a49701Sreyk lea -8(%rax,%rdx),%rax 143631951aaSmortimer RETGUARD_CHECK(strlen, r11) 144c2a49701Sreyk ret 145c2a49701Sreyk 146c2a49701Sreyk/* Misaligned, read aligned word and make low bytes non-zero */ 147c2a49701Sreyk _ALIGN_TEXT 148c2a49701Sreyk10: 149c2a49701Sreyk mov %al,%cl 150c2a49701Sreyk mov $1,%rsi 151c2a49701Sreyk and $7,%cl /* offset into word 1..7 */ 152c2a49701Sreyk and $~7,%al /* start of word with buffer */ 153c2a49701Sreyk shl $3,%cl /* bit count 8, 16 .. 56 */ 154c2a49701Sreyk movq (%rax),%rdx /* first data in high bytes */ 155c2a49701Sreyk shl %cl,%rsi 156c2a49701Sreyk dec %rsi 157c2a49701Sreyk or %rsi,%rdx /* low bytes now non-zero */ 158c2a49701Sreyk jmp 2b 1599b9d2a55SguentherEND_STRONG(strlen) 160