xref: /netbsd-src/common/lib/libc/arch/x86_64/string/strlen.S (revision 8b0f9554ff8762542c4defc4f70e1eb76fb508fa)
1/*
2 * Written by J.T. Conklin <jtc@acorntoolworks.com>
3 * Public domain.
4 */
5
6#include <machine/asm.h>
7
8#if defined(LIBC_SCCS)
9	RCSID("$NetBSD: strlen.S,v 1.1 2005/12/20 19:28:51 christos Exp $")
10#endif
11
12ENTRY(strlen)
13	movq	%rdi,%rax
14	negq	%rdi
15
16.Lalign:
17	/* Consider unrolling loop? */
18	testb	$7,%al
19	je	.Lword_aligned
20	cmpb	$0,(%rax)
21	jne	1f
22	leaq	(%rdi,%rax),%rax
23	ret
241:	incq	%rax
25	jmp	.Lalign
26
27	/*
28	 * There are many well known branch-free sequences which are used
29	 * for determining whether a zero-byte is contained within a word.
30	 * These sequences are generally much more efficent than loading
31	 * and comparing each byte individually.
32	 *
33	 * The expression [1,2]:
34	 *
35	 * (1)  ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
36	 *
37	 * evaluates to a non-zero value if any of the bytes in the
38	 * original word is zero.
39	 *
40	 * It also has the useful property that bytes in the result word
41	 * that correspond to non-zero bytes in the original word have
42	 * the value 0x00, while bytes corresponding to zero bytes have
43	 * the value 0x80. This allows calculation of the first (and
44	 * last) occurrence of a zero byte within the word (useful for C's
45	 * str* primitives) by counting the number of leading (or
46	 * trailing) zeros and dividing the result by 8.  On machines
47	 * without (or with slow) clz() / ctz() instructions, testing
48	 * each byte in the result word for zero is necessary.
49	 *
50	 * This typically takes 4 instructions (5 on machines without
51	 * "not-or") not including those needed to load the constant.
52	 *
53	 *
54	 * The expression:
55	 *
56	 * (2)  ((x - 0x01....01) & ~x & 0x80....80)
57	 *
58	 * evaluates to a non-zero value if any of the bytes in the
59	 * original word is zero.
60	 *
61	 * On little endian machines, the first byte in the result word
62	 * that corresponds to a zero byte in the original byte is 0x80,
63	 * so clz() can be used as above.  On big endian machines, and
64	 * little endian machines without (or with a slow) clz() insn,
65	 * testing each byte in the original for zero is necessary.
66	 *
67	 * This typically takes 3 instructions (4 on machines without
68	 * "and with complement") not including those needed to load
69	 * constants.
70	 *
71	 *
72	 * The expression:
73	 *
74	 * (3)  ((x - 0x01....01) & 0x80....80)
75	 *
76	 * always evaluates to a non-zero value if any of the bytes in
77	 * the original word is zero.  However, in rare cases, it also
78	 * evaluates to a non-zero value when none of the bytes in the
79	 * original word is zero.
80	 *
81	 * To account for possible false positives, each byte of the
82	 * original word must be checked when the expression evaluates to
83	 * a non-zero value.  However, because it is simpler than those
84	 * presented above, code that uses it will be faster as long as
85	 * the rate of false positives is low.
86	 *
87	 * This is likely, because the the false positive can only occur
88	 * if the most siginificant bit of a byte within the word is set.
89	 * The expression will never fail for typical 7-bit ASCII strings.
90	 *
91	 * This typically takes 2 instructions not including those needed
92	 * to load constants.
93	 *
94	 *
95	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
96	 *
97	 * [2] International Business Machines, "The PowerPC Compiler Writer's
98	 *     Guide", Warthman Associates, 1996
99	 */
100
101	_ALIGN_TEXT
102.Lword_aligned:
103	movabsq	$0x0101010101010101,%r8
104	movabsq	$0x8080808080808080,%r9
105.Lloop:
106	movq	(%rax),%rdx
107	addq	$8,%rax
108	subq	%r8,%rdx
109	testq	%r9,%rdx
110	je	.Lloop
111
112	/*
113	 * In rare cases, the above loop may exit prematurely. We must
114	 * return to the loop if none of the bytes in the word equal 0.
115	 */
116	cmpb	$0,-8(%rax)		/* 1st byte == 0? */
117	je	.Lsub8
118	cmpb	$0,-7(%rax)		/* 2nd byte == 0? */
119	je	.Lsub7
120	cmpb	$0,-6(%rax)		/* 3rd byte == 0? */
121	je	.Lsub6
122	cmpb	$0,-5(%rax)		/* 4th byte == 0? */
123	je	.Lsub5
124	cmpb	$0,-4(%rax)		/* 5th byte == 0? */
125	je	.Lsub4
126	cmpb	$0,-3(%rax)		/* 6th byte == 0? */
127	je	.Lsub3
128	cmpb	$0,-2(%rax)		/* 7th byte == 0? */
129	je	.Lsub2
130	cmpb	$0,-1(%rax)		/* 8th byte == 0? */
131	jne	.Lloop
132
133.Lsub1:
134	leaq	-1(%rdi,%rax),%rax
135	ret
136.Lsub2:
137	leaq	-2(%rdi,%rax),%rax
138	ret
139.Lsub3:
140	leaq	-3(%rdi,%rax),%rax
141	ret
142.Lsub4:
143	leaq	-4(%rdi,%rax),%rax
144	ret
145.Lsub5:
146	leaq	-5(%rdi,%rax),%rax
147	ret
148.Lsub6:
149	leaq	-6(%rdi,%rax),%rax
150	ret
151.Lsub7:
152	leaq	-7(%rdi,%rax),%rax
153	ret
154.Lsub8:
155	leaq	-8(%rdi,%rax),%rax
156	ret
157