1/* 2 * Written by J.T. Conklin <jtc@acorntoolworks.com> 3 * Public domain. 4 */ 5 6#include <machine/asm.h> 7 8#if defined(LIBC_SCCS) 9 RCSID("$NetBSD: strlen.S,v 1.1 2005/12/20 19:28:51 christos Exp $") 10#endif 11 12ENTRY(strlen) 13 movq %rdi,%rax 14 negq %rdi 15 16.Lalign: 17 /* Consider unrolling loop? */ 18 testb $7,%al 19 je .Lword_aligned 20 cmpb $0,(%rax) 21 jne 1f 22 leaq (%rdi,%rax),%rax 23 ret 241: incq %rax 25 jmp .Lalign 26 27 /* 28 * There are many well known branch-free sequences which are used 29 * for determining whether a zero-byte is contained within a word. 30 * These sequences are generally much more efficent than loading 31 * and comparing each byte individually. 32 * 33 * The expression [1,2]: 34 * 35 * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f)) 36 * 37 * evaluates to a non-zero value if any of the bytes in the 38 * original word is zero. 39 * 40 * It also has the useful property that bytes in the result word 41 * that correspond to non-zero bytes in the original word have 42 * the value 0x00, while bytes corresponding to zero bytes have 43 * the value 0x80. This allows calculation of the first (and 44 * last) occurrence of a zero byte within the word (useful for C's 45 * str* primitives) by counting the number of leading (or 46 * trailing) zeros and dividing the result by 8. On machines 47 * without (or with slow) clz() / ctz() instructions, testing 48 * each byte in the result word for zero is necessary. 49 * 50 * This typically takes 4 instructions (5 on machines without 51 * "not-or") not including those needed to load the constant. 52 * 53 * 54 * The expression: 55 * 56 * (2) ((x - 0x01....01) & ~x & 0x80....80) 57 * 58 * evaluates to a non-zero value if any of the bytes in the 59 * original word is zero. 60 * 61 * On little endian machines, the first byte in the result word 62 * that corresponds to a zero byte in the original byte is 0x80, 63 * so clz() can be used as above. On big endian machines, and 64 * little endian machines without (or with a slow) clz() insn, 65 * testing each byte in the original for zero is necessary. 66 * 67 * This typically takes 3 instructions (4 on machines without 68 * "and with complement") not including those needed to load 69 * constants. 70 * 71 * 72 * The expression: 73 * 74 * (3) ((x - 0x01....01) & 0x80....80) 75 * 76 * always evaluates to a non-zero value if any of the bytes in 77 * the original word is zero. However, in rare cases, it also 78 * evaluates to a non-zero value when none of the bytes in the 79 * original word is zero. 80 * 81 * To account for possible false positives, each byte of the 82 * original word must be checked when the expression evaluates to 83 * a non-zero value. However, because it is simpler than those 84 * presented above, code that uses it will be faster as long as 85 * the rate of false positives is low. 86 * 87 * This is likely, because the the false positive can only occur 88 * if the most siginificant bit of a byte within the word is set. 89 * The expression will never fail for typical 7-bit ASCII strings. 90 * 91 * This typically takes 2 instructions not including those needed 92 * to load constants. 93 * 94 * 95 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 96 * 97 * [2] International Business Machines, "The PowerPC Compiler Writer's 98 * Guide", Warthman Associates, 1996 99 */ 100 101 _ALIGN_TEXT 102.Lword_aligned: 103 movabsq $0x0101010101010101,%r8 104 movabsq $0x8080808080808080,%r9 105.Lloop: 106 movq (%rax),%rdx 107 addq $8,%rax 108 subq %r8,%rdx 109 testq %r9,%rdx 110 je .Lloop 111 112 /* 113 * In rare cases, the above loop may exit prematurely. We must 114 * return to the loop if none of the bytes in the word equal 0. 115 */ 116 cmpb $0,-8(%rax) /* 1st byte == 0? */ 117 je .Lsub8 118 cmpb $0,-7(%rax) /* 2nd byte == 0? */ 119 je .Lsub7 120 cmpb $0,-6(%rax) /* 3rd byte == 0? */ 121 je .Lsub6 122 cmpb $0,-5(%rax) /* 4th byte == 0? */ 123 je .Lsub5 124 cmpb $0,-4(%rax) /* 5th byte == 0? */ 125 je .Lsub4 126 cmpb $0,-3(%rax) /* 6th byte == 0? */ 127 je .Lsub3 128 cmpb $0,-2(%rax) /* 7th byte == 0? */ 129 je .Lsub2 130 cmpb $0,-1(%rax) /* 8th byte == 0? */ 131 jne .Lloop 132 133.Lsub1: 134 leaq -1(%rdi,%rax),%rax 135 ret 136.Lsub2: 137 leaq -2(%rdi,%rax),%rax 138 ret 139.Lsub3: 140 leaq -3(%rdi,%rax),%rax 141 ret 142.Lsub4: 143 leaq -4(%rdi,%rax),%rax 144 ret 145.Lsub5: 146 leaq -5(%rdi,%rax),%rax 147 ret 148.Lsub6: 149 leaq -6(%rdi,%rax),%rax 150 ret 151.Lsub7: 152 leaq -7(%rdi,%rax),%rax 153 ret 154.Lsub8: 155 leaq -8(%rdi,%rax),%rax 156 ret 157