1b6cbf720SGianluca Guida/* 2b6cbf720SGianluca Guida * Written by J.T. Conklin <jtc@acorntoolworks.com> 3b6cbf720SGianluca Guida * Public domain. 4b6cbf720SGianluca Guida */ 5b6cbf720SGianluca Guida 6b6cbf720SGianluca Guida#include <machine/asm.h> 7b6cbf720SGianluca Guida 8b6cbf720SGianluca Guida#if defined(LIBC_SCCS) 9*0a6a1f1dSLionel Sambuc RCSID("$NetBSD: strlen.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $") 10b6cbf720SGianluca Guida#endif 11b6cbf720SGianluca Guida 12b6cbf720SGianluca GuidaENTRY(strlen) 13b6cbf720SGianluca Guida movl 4(%esp),%eax 14b6cbf720SGianluca Guida 15b6cbf720SGianluca Guida.Lalign: 16b6cbf720SGianluca Guida /* Consider unrolling loop? */ 17b6cbf720SGianluca Guida testb $3,%al 18b6cbf720SGianluca Guida je .Lword_aligned 19b6cbf720SGianluca Guida cmpb $0,(%eax) 20b6cbf720SGianluca Guida je .Ldone 21b6cbf720SGianluca Guida incl %eax 22b6cbf720SGianluca Guida jmp .Lalign 23b6cbf720SGianluca Guida 24b6cbf720SGianluca Guida /* 25b6cbf720SGianluca Guida * There are many well known branch-free sequences which are used 26b6cbf720SGianluca Guida * for determining whether a zero-byte is contained within a word. 27b6cbf720SGianluca Guida * These sequences are generally much more efficent than loading 28b6cbf720SGianluca Guida * and comparing each byte individually. 29b6cbf720SGianluca Guida * 30b6cbf720SGianluca Guida * The expression [1,2]: 31b6cbf720SGianluca Guida * 32b6cbf720SGianluca Guida * (1) ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | (x | 0x7f7f7f7f)) 33b6cbf720SGianluca Guida * 34b6cbf720SGianluca Guida * evaluates to a non-zero value if any of the bytes in the 35b6cbf720SGianluca Guida * original word is zero. 36b6cbf720SGianluca Guida * 37b6cbf720SGianluca Guida * It also has the useful property that bytes in the result word 38b6cbf720SGianluca Guida * that correspond to non-zero bytes in the original word have 39b6cbf720SGianluca Guida * the value 0x00, while bytes corresponding to zero bytes have 40b6cbf720SGianluca Guida * the value 0x80. This allows calculation of the first (and 41b6cbf720SGianluca Guida * last) occurrence of a zero byte within the word (useful for C's 42b6cbf720SGianluca Guida * str* primitives) by counting the number of leading (or 43b6cbf720SGianluca Guida * trailing) zeros and dividing the result by 8. On machines 44b6cbf720SGianluca Guida * without (or with slow) clz() / ctz() instructions, testing 45b6cbf720SGianluca Guida * each byte in the result word for zero is necessary. 46b6cbf720SGianluca Guida * 47b6cbf720SGianluca Guida * This typically takes 4 instructions (5 on machines without 48b6cbf720SGianluca Guida * "not-or") not including those needed to load the constant. 49b6cbf720SGianluca Guida * 50b6cbf720SGianluca Guida * 51b6cbf720SGianluca Guida * The expression: 52b6cbf720SGianluca Guida * 53b6cbf720SGianluca Guida * (2) ((x - 0x01010101) & ~x & 0x80808080) 54b6cbf720SGianluca Guida * 55b6cbf720SGianluca Guida * evaluates to a non-zero value if any of the bytes in the 56b6cbf720SGianluca Guida * original word is zero. 57b6cbf720SGianluca Guida * 58b6cbf720SGianluca Guida * On little endian machines, the first byte in the result word 59b6cbf720SGianluca Guida * that corresponds to a zero byte in the original byte is 0x80, 60b6cbf720SGianluca Guida * so clz() can be used as above. On big endian machines, and 61b6cbf720SGianluca Guida * little endian machines without (or with a slow) clz() insn, 62b6cbf720SGianluca Guida * testing each byte in the original for zero is necessary. 63b6cbf720SGianluca Guida * 64b6cbf720SGianluca Guida * This typically takes 3 instructions (4 on machines without 65b6cbf720SGianluca Guida * "and with complement") not including those needed to load 66b6cbf720SGianluca Guida * constants. 67b6cbf720SGianluca Guida * 68b6cbf720SGianluca Guida * 69b6cbf720SGianluca Guida * The expression: 70b6cbf720SGianluca Guida * 71b6cbf720SGianluca Guida * (3) ((x - 0x01010101) & 0x80808080) 72b6cbf720SGianluca Guida * 73b6cbf720SGianluca Guida * always evaluates to a non-zero value if any of the bytes in 74b6cbf720SGianluca Guida * the original word is zero. However, in rare cases, it also 75b6cbf720SGianluca Guida * evaluates to a non-zero value when none of the bytes in the 76b6cbf720SGianluca Guida * original word is zero. 77b6cbf720SGianluca Guida * 78b6cbf720SGianluca Guida * To account for possible false positives, each byte of the 79b6cbf720SGianluca Guida * original word must be checked when the expression evaluates to 80b6cbf720SGianluca Guida * a non-zero value. However, because it is simpler than those 81b6cbf720SGianluca Guida * presented above, code that uses it will be faster as long as 82b6cbf720SGianluca Guida * the rate of false positives is low. 83b6cbf720SGianluca Guida * 84b6cbf720SGianluca Guida * This is likely, because the the false positive can only occur 85b6cbf720SGianluca Guida * if the most siginificant bit of a byte within the word is set. 86b6cbf720SGianluca Guida * The expression will never fail for typical 7-bit ASCII strings. 87b6cbf720SGianluca Guida * 88b6cbf720SGianluca Guida * This typically takes 2 instructions not including those needed 89b6cbf720SGianluca Guida * to load constants. 90b6cbf720SGianluca Guida * 91b6cbf720SGianluca Guida * 92b6cbf720SGianluca Guida * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 93b6cbf720SGianluca Guida * 94b6cbf720SGianluca Guida * [2] International Business Machines, "The PowerPC Compiler Writer's 95b6cbf720SGianluca Guida * Guide", Warthman Associates, 1996 96b6cbf720SGianluca Guida */ 97b6cbf720SGianluca Guida 98b6cbf720SGianluca Guida _ALIGN_TEXT 99b6cbf720SGianluca Guida.Lword_aligned: 100b6cbf720SGianluca Guida.Lloop: 101b6cbf720SGianluca Guida movl (%eax),%ecx 102b6cbf720SGianluca Guida addl $4,%eax 103b6cbf720SGianluca Guida leal -0x01010101(%ecx),%edx 104b6cbf720SGianluca Guida testl $0x80808080,%edx 105b6cbf720SGianluca Guida je .Lloop 106b6cbf720SGianluca Guida 107b6cbf720SGianluca Guida /* 108b6cbf720SGianluca Guida * In rare cases, the above loop may exit prematurely. We must 109b6cbf720SGianluca Guida * return to the loop if none of the bytes in the word equal 0. 110b6cbf720SGianluca Guida */ 111b6cbf720SGianluca Guida 112b6cbf720SGianluca Guida /* 113b6cbf720SGianluca Guida * The optimal code for determining whether each byte is zero 114b6cbf720SGianluca Guida * differs by processor. This space-optimized code should be 115b6cbf720SGianluca Guida * acceptable on all, especially since we don't expect it to 116b6cbf720SGianluca Guida * be run frequently, 117b6cbf720SGianluca Guida */ 118b6cbf720SGianluca Guida 119b6cbf720SGianluca Guida testb %cl,%cl /* 1st byte == 0? */ 120b6cbf720SGianluca Guida jne 1f 121b6cbf720SGianluca Guida subl $4,%eax 122b6cbf720SGianluca Guida jmp .Ldone 123b6cbf720SGianluca Guida 124b6cbf720SGianluca Guida1: testb %ch,%ch /* 2nd byte == 0? */ 125b6cbf720SGianluca Guida jne 1f 126b6cbf720SGianluca Guida subl $3,%eax 127b6cbf720SGianluca Guida jmp .Ldone 128b6cbf720SGianluca Guida 129b6cbf720SGianluca Guida1: shrl $16,%ecx 130b6cbf720SGianluca Guida testb %cl,%cl /* 3rd byte == 0? */ 131b6cbf720SGianluca Guida jne 1f 132b6cbf720SGianluca Guida subl $2,%eax 133b6cbf720SGianluca Guida jmp .Ldone 134b6cbf720SGianluca Guida 135b6cbf720SGianluca Guida1: testb %ch,%ch /* 4th byte == 0? */ 136b6cbf720SGianluca Guida jne .Lloop 137b6cbf720SGianluca Guida decl %eax 138b6cbf720SGianluca Guida 139b6cbf720SGianluca Guida.Ldone: 140b6cbf720SGianluca Guida subl 4(%esp),%eax 141b6cbf720SGianluca Guida ret 142*0a6a1f1dSLionel SambucEND(strlen) 143