xref: /openbsd-src/lib/libc/arch/amd64/string/strlen.S (revision d771b0cb749d26db6eecd347aa78a43eaf00fa24)
1*d771b0cbSjsg/*	$OpenBSD: strlen.S,v 1.9 2022/01/11 09:21:34 jsg Exp $	*/
2c2a49701Sreyk/*	$NetBSD: strlen.S,v 1.6 2014/03/22 19:16:34 jakllsch Exp $	*/
3c2a49701Sreyk
4c2a49701Sreyk/*-
5c2a49701Sreyk * Copyright (c) 2009 The NetBSD Foundation, Inc.
6c2a49701Sreyk * All rights reserved.
7c2a49701Sreyk *
8c2a49701Sreyk * This code is derived from software contributed to The NetBSD Foundation
9c2a49701Sreyk * by David Laight.
10c2a49701Sreyk *
11c2a49701Sreyk * Redistribution and use in source and binary forms, with or without
12c2a49701Sreyk * modification, are permitted provided that the following conditions
13c2a49701Sreyk * are met:
14c2a49701Sreyk * 1. Redistributions of source code must retain the above copyright
15c2a49701Sreyk *    notice, this list of conditions and the following disclaimer.
16c2a49701Sreyk * 2. Redistributions in binary form must reproduce the above copyright
17c2a49701Sreyk *    notice, this list of conditions and the following disclaimer in the
18c2a49701Sreyk *    documentation and/or other materials provided with the distribution.
19c2a49701Sreyk *
20c2a49701Sreyk * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21c2a49701Sreyk * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22c2a49701Sreyk * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23c2a49701Sreyk * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24c2a49701Sreyk * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25c2a49701Sreyk * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26c2a49701Sreyk * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27c2a49701Sreyk * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28c2a49701Sreyk * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29c2a49701Sreyk * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30c2a49701Sreyk * POSSIBILITY OF SUCH DAMAGE.
31c2a49701Sreyk */
32c2a49701Sreyk
33c2a49701Sreyk/*
34c2a49701Sreyk * Inspired by a version written by J.T. Conklin <jtc@acorntoolworks.com>
35c2a49701Sreyk * (Only the long comment really remains his work!)
36c2a49701Sreyk */
37c2a49701Sreyk
38b4551fdeSguenther#include "DEFS.h"
39c2a49701Sreyk
40c2a49701Sreyk/*
41c2a49701Sreyk * There are many well known branch-free sequences which are used
42c2a49701Sreyk * for determining whether a zero-byte is contained within a word.
43*d771b0cbSjsg * These sequences are generally much more efficient than loading
44c2a49701Sreyk * and comparing each byte individually.
45c2a49701Sreyk *
46c2a49701Sreyk * The expression [1,2]:
47c2a49701Sreyk *
48c2a49701Sreyk * (1)  ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
49c2a49701Sreyk *
50c2a49701Sreyk * evaluates to a non-zero value if any of the bytes in the
51c2a49701Sreyk * original word is zero.
52c2a49701Sreyk *
53c2a49701Sreyk * It also has the useful property that bytes in the result word
54c2a49701Sreyk * that correspond to non-zero bytes in the original word have
55c2a49701Sreyk * the value 0x00, while bytes corresponding to zero bytes have
56c2a49701Sreyk * the value 0x80. This allows calculation of the first (and
57c2a49701Sreyk * last) occurrence of a zero byte within the word (useful for C's
58c2a49701Sreyk * str* primitives) by counting the number of leading (or
59c2a49701Sreyk * trailing) zeros and dividing the result by 8.  On machines
60c2a49701Sreyk * without (or with slow) clz() / ctz() instructions, testing
61c2a49701Sreyk * each byte in the result word for zero is necessary.
62c2a49701Sreyk *
63c2a49701Sreyk * This typically takes 4 instructions (5 on machines without
64c2a49701Sreyk * "not-or") not including those needed to load the constant.
65c2a49701Sreyk *
66c2a49701Sreyk *
67c2a49701Sreyk * The expression:
68c2a49701Sreyk *
69c2a49701Sreyk * (2)  ((x - 0x01....01) & 0x80....80 & ~x)
70c2a49701Sreyk *
71c2a49701Sreyk * evaluates to a non-zero value if any of the bytes in the
72c2a49701Sreyk * original word is zero.
73c2a49701Sreyk *
74c2a49701Sreyk * On little endian machines, the first byte in the result word
75c2a49701Sreyk * that corresponds to a zero byte in the original byte is 0x80,
76c2a49701Sreyk * so clz() can be used as above.  On big endian machines, and
77c2a49701Sreyk * little endian machines without (or with a slow) clz() insn,
78c2a49701Sreyk * testing each byte in the original for zero is necessary.
79c2a49701Sreyk *
80c2a49701Sreyk * This typically takes 3 instructions (4 on machines without
81c2a49701Sreyk * "and with complement") not including those needed to load
82c2a49701Sreyk * constants.
83c2a49701Sreyk *
84c2a49701Sreyk *
85c2a49701Sreyk * The expression:
86c2a49701Sreyk *
87c2a49701Sreyk * (3)  ((x - 0x01....01) & 0x80....80)
88c2a49701Sreyk *
89c2a49701Sreyk * always evaluates to a non-zero value if any of the bytes in
90c2a49701Sreyk * the original word is zero or has the top bit set.
91c2a49701Sreyk * For strings that are likely to only contain 7-bit ascii these
92c2a49701Sreyk * false positives will be rare.
93c2a49701Sreyk *
94c2a49701Sreyk * To account for possible false positives, each byte of the
95c2a49701Sreyk * original word must be checked when the expression evaluates to
96c2a49701Sreyk * a non-zero value.  However, because it is simpler than those
97c2a49701Sreyk * presented above, code that uses it will be faster as long as
98c2a49701Sreyk * the rate of false positives is low.
99c2a49701Sreyk *
100aa753eebSmmcc * This is likely, because the false positive can only occur
101*d771b0cbSjsg * if the most significant bit of a byte within the word is set.
102c2a49701Sreyk * The expression will never fail for typical 7-bit ASCII strings.
103c2a49701Sreyk *
104c2a49701Sreyk * This typically takes 2 instructions not including those needed
105c2a49701Sreyk * to load constants.
106c2a49701Sreyk *
107c2a49701Sreyk *
108c2a49701Sreyk * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
109c2a49701Sreyk *
110c2a49701Sreyk * [2] International Business Machines, "The PowerPC Compiler Writer's
111c2a49701Sreyk *     Guide", Warthman Associates, 1996
112c2a49701Sreyk */
113c2a49701Sreyk
114c2a49701SreykENTRY(strlen)
115631951aaSmortimer	RETGUARD_SETUP(strlen, r11)
116c2a49701Sreyk	movabsq	$0x0101010101010101,%r8
117c2a49701Sreyk
118c2a49701Sreyk	test	$7,%dil
119c2a49701Sreyk	movq	%rdi,%rax		/* Buffer, %rdi unchanged */
120c2a49701Sreyk	movabsq	$0x8080808080808080,%r9
121c2a49701Sreyk	jnz	10f			/* Jump if misaligned */
122c2a49701Sreyk
123c2a49701Sreyk	_ALIGN_TEXT
124c2a49701Sreyk1:
125c2a49701Sreyk	movq	(%rax),%rdx		/* get bytes to check */
126c2a49701Sreyk2:
127c2a49701Sreyk	addq	$8,%rax
128c2a49701Sreyk	mov	%rdx,%rcx		/* save for later check */
129c2a49701Sreyk	subq	%r8,%rdx		/* alg (3) above first */
130c2a49701Sreyk	not	%rcx			/* Invert of data */
131c2a49701Sreyk	andq	%r9,%rdx
132c2a49701Sreyk	je	1b			/* jump if all 0x01-0x80 */
133c2a49701Sreyk
134c2a49701Sreyk	/* Do check from alg (2) above - loops for 0x81..0xff bytes */
135c2a49701Sreyk	andq	%rcx,%rdx
136c2a49701Sreyk	je	1b
137c2a49701Sreyk
138c2a49701Sreyk	/* Since we are LE, use bit scan for first 0x80 byte */
139c2a49701Sreyk	sub	%rdi,%rax		/* length to next word */
140c2a49701Sreyk	bsf	%rdx,%rdx		/* 7, 15, 23 ... 63 */
141c2a49701Sreyk	shr	$3,%rdx			/* 0, 1, 2 ... 7 */
142c2a49701Sreyk	lea	-8(%rax,%rdx),%rax
143631951aaSmortimer	RETGUARD_CHECK(strlen, r11)
144c2a49701Sreyk	ret
145c2a49701Sreyk
146c2a49701Sreyk/* Misaligned, read aligned word and make low bytes non-zero */
147c2a49701Sreyk	_ALIGN_TEXT
148c2a49701Sreyk10:
149c2a49701Sreyk	mov	%al,%cl
150c2a49701Sreyk	mov	$1,%rsi
151c2a49701Sreyk	and	$7,%cl			/* offset into word 1..7 */
152c2a49701Sreyk	and	$~7,%al			/* start of word with buffer */
153c2a49701Sreyk	shl	$3,%cl			/* bit count 8, 16 .. 56 */
154c2a49701Sreyk	movq	(%rax),%rdx		/* first data in high bytes */
155c2a49701Sreyk	shl	%cl,%rsi
156c2a49701Sreyk	dec	%rsi
157c2a49701Sreyk	or	%rsi,%rdx		/* low bytes now non-zero */
158c2a49701Sreyk	jmp	2b
1599b9d2a55SguentherEND_STRONG(strlen)
160