137c9f0a6Schristos/* 237c9f0a6Schristos * Written by J.T. Conklin <jtc@acorntoolworks.com> 337c9f0a6Schristos * Public domain. 437c9f0a6Schristos */ 537c9f0a6Schristos 637c9f0a6Schristos#include <machine/asm.h> 737c9f0a6Schristos 837c9f0a6Schristos#if defined(LIBC_SCCS) 9*3282d009Sandvar RCSID("$NetBSD: strlen.S,v 1.5 2024/03/30 22:03:39 andvar Exp $") 1037c9f0a6Schristos#endif 1137c9f0a6Schristos 1237c9f0a6SchristosENTRY(strlen) 1337c9f0a6Schristos movl 4(%esp),%eax 1437c9f0a6Schristos 1537c9f0a6Schristos.Lalign: 1637c9f0a6Schristos /* Consider unrolling loop? */ 1737c9f0a6Schristos testb $3,%al 1837c9f0a6Schristos je .Lword_aligned 1937c9f0a6Schristos cmpb $0,(%eax) 2037c9f0a6Schristos je .Ldone 2137c9f0a6Schristos incl %eax 2237c9f0a6Schristos jmp .Lalign 2337c9f0a6Schristos 2437c9f0a6Schristos /* 2537c9f0a6Schristos * There are many well known branch-free sequences which are used 2637c9f0a6Schristos * for determining whether a zero-byte is contained within a word. 2742412bc7Sandvar * These sequences are generally much more efficient than loading 2837c9f0a6Schristos * and comparing each byte individually. 2937c9f0a6Schristos * 3037c9f0a6Schristos * The expression [1,2]: 3137c9f0a6Schristos * 3237c9f0a6Schristos * (1) ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | (x | 0x7f7f7f7f)) 3337c9f0a6Schristos * 3437c9f0a6Schristos * evaluates to a non-zero value if any of the bytes in the 3537c9f0a6Schristos * original word is zero. 3637c9f0a6Schristos * 3737c9f0a6Schristos * It also has the useful property that bytes in the result word 3837c9f0a6Schristos * that correspond to non-zero bytes in the original word have 3937c9f0a6Schristos * the value 0x00, while bytes corresponding to zero bytes have 4037c9f0a6Schristos * the value 0x80. This allows calculation of the first (and 4137c9f0a6Schristos * last) occurrence of a zero byte within the word (useful for C's 4237c9f0a6Schristos * str* primitives) by counting the number of leading (or 4337c9f0a6Schristos * trailing) zeros and dividing the result by 8. On machines 4437c9f0a6Schristos * without (or with slow) clz() / ctz() instructions, testing 4537c9f0a6Schristos * each byte in the result word for zero is necessary. 4637c9f0a6Schristos * 4737c9f0a6Schristos * This typically takes 4 instructions (5 on machines without 4837c9f0a6Schristos * "not-or") not including those needed to load the constant. 4937c9f0a6Schristos * 5037c9f0a6Schristos * 5137c9f0a6Schristos * The expression: 5237c9f0a6Schristos * 5337c9f0a6Schristos * (2) ((x - 0x01010101) & ~x & 0x80808080) 5437c9f0a6Schristos * 5537c9f0a6Schristos * evaluates to a non-zero value if any of the bytes in the 5637c9f0a6Schristos * original word is zero. 5737c9f0a6Schristos * 5837c9f0a6Schristos * On little endian machines, the first byte in the result word 5937c9f0a6Schristos * that corresponds to a zero byte in the original byte is 0x80, 6037c9f0a6Schristos * so clz() can be used as above. On big endian machines, and 6137c9f0a6Schristos * little endian machines without (or with a slow) clz() insn, 6237c9f0a6Schristos * testing each byte in the original for zero is necessary. 6337c9f0a6Schristos * 6437c9f0a6Schristos * This typically takes 3 instructions (4 on machines without 6537c9f0a6Schristos * "and with complement") not including those needed to load 6637c9f0a6Schristos * constants. 6737c9f0a6Schristos * 6837c9f0a6Schristos * 6937c9f0a6Schristos * The expression: 7037c9f0a6Schristos * 7137c9f0a6Schristos * (3) ((x - 0x01010101) & 0x80808080) 7237c9f0a6Schristos * 7337c9f0a6Schristos * always evaluates to a non-zero value if any of the bytes in 7437c9f0a6Schristos * the original word is zero. However, in rare cases, it also 7537c9f0a6Schristos * evaluates to a non-zero value when none of the bytes in the 7637c9f0a6Schristos * original word is zero. 7737c9f0a6Schristos * 7837c9f0a6Schristos * To account for possible false positives, each byte of the 7937c9f0a6Schristos * original word must be checked when the expression evaluates to 8037c9f0a6Schristos * a non-zero value. However, because it is simpler than those 8137c9f0a6Schristos * presented above, code that uses it will be faster as long as 8237c9f0a6Schristos * the rate of false positives is low. 8337c9f0a6Schristos * 8450d90726Sandvar * This is likely, because the false positive can only occur 8537c9f0a6Schristos * if the most siginificant bit of a byte within the word is set. 8637c9f0a6Schristos * The expression will never fail for typical 7-bit ASCII strings. 8737c9f0a6Schristos * 8837c9f0a6Schristos * This typically takes 2 instructions not including those needed 8937c9f0a6Schristos * to load constants. 9037c9f0a6Schristos * 9137c9f0a6Schristos * 92*3282d009Sandvar * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Wesley 2003 9337c9f0a6Schristos * 9437c9f0a6Schristos * [2] International Business Machines, "The PowerPC Compiler Writer's 9537c9f0a6Schristos * Guide", Warthman Associates, 1996 9637c9f0a6Schristos */ 9737c9f0a6Schristos 9837c9f0a6Schristos _ALIGN_TEXT 9937c9f0a6Schristos.Lword_aligned: 10037c9f0a6Schristos.Lloop: 10137c9f0a6Schristos movl (%eax),%ecx 10237c9f0a6Schristos addl $4,%eax 10337c9f0a6Schristos leal -0x01010101(%ecx),%edx 10437c9f0a6Schristos testl $0x80808080,%edx 10537c9f0a6Schristos je .Lloop 10637c9f0a6Schristos 10737c9f0a6Schristos /* 10837c9f0a6Schristos * In rare cases, the above loop may exit prematurely. We must 10937c9f0a6Schristos * return to the loop if none of the bytes in the word equal 0. 11037c9f0a6Schristos */ 11137c9f0a6Schristos 11237c9f0a6Schristos /* 11337c9f0a6Schristos * The optimal code for determining whether each byte is zero 11437c9f0a6Schristos * differs by processor. This space-optimized code should be 11537c9f0a6Schristos * acceptable on all, especially since we don't expect it to 11637c9f0a6Schristos * be run frequently, 11737c9f0a6Schristos */ 11837c9f0a6Schristos 11937c9f0a6Schristos testb %cl,%cl /* 1st byte == 0? */ 12037c9f0a6Schristos jne 1f 12137c9f0a6Schristos subl $4,%eax 12237c9f0a6Schristos jmp .Ldone 12337c9f0a6Schristos 12437c9f0a6Schristos1: testb %ch,%ch /* 2nd byte == 0? */ 12537c9f0a6Schristos jne 1f 12637c9f0a6Schristos subl $3,%eax 12737c9f0a6Schristos jmp .Ldone 12837c9f0a6Schristos 12937c9f0a6Schristos1: shrl $16,%ecx 13037c9f0a6Schristos testb %cl,%cl /* 3rd byte == 0? */ 13137c9f0a6Schristos jne 1f 13237c9f0a6Schristos subl $2,%eax 13337c9f0a6Schristos jmp .Ldone 13437c9f0a6Schristos 13537c9f0a6Schristos1: testb %ch,%ch /* 4th byte == 0? */ 13637c9f0a6Schristos jne .Lloop 13737c9f0a6Schristos decl %eax 13837c9f0a6Schristos 13937c9f0a6Schristos.Ldone: 14037c9f0a6Schristos subl 4(%esp),%eax 14137c9f0a6Schristos ret 1422c56941eSjakllschEND(strlen) 143