1/* 2 * strncmp - compare two strings 3 * 4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 * See https://llvm.org/LICENSE.txt for license information. 6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 */ 8 9/* Assumptions: 10 * 11 * ARMv8-a, AArch64 12 */ 13 14#include "../asmdefs.h" 15 16#define REP8_01 0x0101010101010101 17#define REP8_7f 0x7f7f7f7f7f7f7f7f 18#define REP8_80 0x8080808080808080 19 20/* Parameters and result. */ 21#define src1 x0 22#define src2 x1 23#define limit x2 24#define result x0 25 26/* Internal variables. */ 27#define data1 x3 28#define data1w w3 29#define data2 x4 30#define data2w w4 31#define has_nul x5 32#define diff x6 33#define syndrome x7 34#define tmp1 x8 35#define tmp2 x9 36#define tmp3 x10 37#define zeroones x11 38#define pos x12 39#define limit_wd x13 40#define mask x14 41#define endloop x15 42#define count mask 43 44 .text 45 .p2align 6 46 .rep 7 47 nop /* Pad so that the loop below fits a cache line. */ 48 .endr 49ENTRY_ALIGN (__strncmp_aarch64, 0) 50 cbz limit, L(ret0) 51 eor tmp1, src1, src2 52 mov zeroones, #REP8_01 53 tst tmp1, #7 54 and count, src1, #7 55 b.ne L(misaligned8) 56 cbnz count, L(mutual_align) 57 /* Calculate the number of full and partial words -1. */ 58 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 59 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 60 61 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 62 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 63 can be done in parallel across the entire word. */ 64 /* Start of performance-critical section -- one 64B cache line. */ 65L(loop_aligned): 66 ldr data1, [src1], #8 67 ldr data2, [src2], #8 68L(start_realigned): 69 subs limit_wd, limit_wd, #1 70 sub tmp1, data1, zeroones 71 orr tmp2, data1, #REP8_7f 72 eor diff, data1, data2 /* Non-zero if differences found. */ 73 csinv endloop, diff, xzr, pl /* Last Dword or differences. */ 74 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 75 ccmp endloop, #0, #0, eq 76 b.eq L(loop_aligned) 77 /* End of performance-critical section -- one 64B cache line. */ 78 79 /* Not reached the limit, must have found the end or a diff. */ 80 tbz limit_wd, #63, L(not_limit) 81 82 /* Limit % 8 == 0 => all bytes significant. */ 83 ands limit, limit, #7 84 b.eq L(not_limit) 85 86 lsl limit, limit, #3 /* Bits -> bytes. */ 87 mov mask, #~0 88#ifdef __AARCH64EB__ 89 lsr mask, mask, limit 90#else 91 lsl mask, mask, limit 92#endif 93 bic data1, data1, mask 94 bic data2, data2, mask 95 96 /* Make sure that the NUL byte is marked in the syndrome. */ 97 orr has_nul, has_nul, mask 98 99L(not_limit): 100 orr syndrome, diff, has_nul 101 102#ifndef __AARCH64EB__ 103 rev syndrome, syndrome 104 rev data1, data1 105 /* The MS-non-zero bit of the syndrome marks either the first bit 106 that is different, or the top bit of the first zero byte. 107 Shifting left now will bring the critical information into the 108 top bits. */ 109 clz pos, syndrome 110 rev data2, data2 111 lsl data1, data1, pos 112 lsl data2, data2, pos 113 /* But we need to zero-extend (char is unsigned) the value and then 114 perform a signed 32-bit subtraction. */ 115 lsr data1, data1, #56 116 sub result, data1, data2, lsr #56 117 ret 118#else 119 /* For big-endian we cannot use the trick with the syndrome value 120 as carry-propagation can corrupt the upper bits if the trailing 121 bytes in the string contain 0x01. */ 122 /* However, if there is no NUL byte in the dword, we can generate 123 the result directly. We can't just subtract the bytes as the 124 MSB might be significant. */ 125 cbnz has_nul, 1f 126 cmp data1, data2 127 cset result, ne 128 cneg result, result, lo 129 ret 1301: 131 /* Re-compute the NUL-byte detection, using a byte-reversed value. */ 132 rev tmp3, data1 133 sub tmp1, tmp3, zeroones 134 orr tmp2, tmp3, #REP8_7f 135 bic has_nul, tmp1, tmp2 136 rev has_nul, has_nul 137 orr syndrome, diff, has_nul 138 clz pos, syndrome 139 /* The MS-non-zero bit of the syndrome marks either the first bit 140 that is different, or the top bit of the first zero byte. 141 Shifting left now will bring the critical information into the 142 top bits. */ 143 lsl data1, data1, pos 144 lsl data2, data2, pos 145 /* But we need to zero-extend (char is unsigned) the value and then 146 perform a signed 32-bit subtraction. */ 147 lsr data1, data1, #56 148 sub result, data1, data2, lsr #56 149 ret 150#endif 151 152L(mutual_align): 153 /* Sources are mutually aligned, but are not currently at an 154 alignment boundary. Round down the addresses and then mask off 155 the bytes that precede the start point. 156 We also need to adjust the limit calculations, but without 157 overflowing if the limit is near ULONG_MAX. */ 158 bic src1, src1, #7 159 bic src2, src2, #7 160 ldr data1, [src1], #8 161 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ 162 ldr data2, [src2], #8 163 mov tmp2, #~0 164 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 165#ifdef __AARCH64EB__ 166 /* Big-endian. Early bytes are at MSB. */ 167 lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ 168#else 169 /* Little-endian. Early bytes are at LSB. */ 170 lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ 171#endif 172 and tmp3, limit_wd, #7 173 lsr limit_wd, limit_wd, #3 174 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ 175 add limit, limit, count 176 add tmp3, tmp3, count 177 orr data1, data1, tmp2 178 orr data2, data2, tmp2 179 add limit_wd, limit_wd, tmp3, lsr #3 180 b L(start_realigned) 181 182 .p2align 6 183 /* Don't bother with dwords for up to 16 bytes. */ 184L(misaligned8): 185 cmp limit, #16 186 b.hs L(try_misaligned_words) 187 188L(byte_loop): 189 /* Perhaps we can do better than this. */ 190 ldrb data1w, [src1], #1 191 ldrb data2w, [src2], #1 192 subs limit, limit, #1 193 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ 194 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 195 b.eq L(byte_loop) 196L(done): 197 sub result, data1, data2 198 ret 199 /* Align the SRC1 to a dword by doing a bytewise compare and then do 200 the dword loop. */ 201L(try_misaligned_words): 202 lsr limit_wd, limit, #3 203 cbz count, L(do_misaligned) 204 205 neg count, count 206 and count, count, #7 207 sub limit, limit, count 208 lsr limit_wd, limit, #3 209 210L(page_end_loop): 211 ldrb data1w, [src1], #1 212 ldrb data2w, [src2], #1 213 cmp data1w, #1 214 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 215 b.ne L(done) 216 subs count, count, #1 217 b.hi L(page_end_loop) 218 219L(do_misaligned): 220 /* Prepare ourselves for the next page crossing. Unlike the aligned 221 loop, we fetch 1 less dword because we risk crossing bounds on 222 SRC2. */ 223 mov count, #8 224 subs limit_wd, limit_wd, #1 225 b.lo L(done_loop) 226L(loop_misaligned): 227 and tmp2, src2, #0xff8 228 eor tmp2, tmp2, #0xff8 229 cbz tmp2, L(page_end_loop) 230 231 ldr data1, [src1], #8 232 ldr data2, [src2], #8 233 sub tmp1, data1, zeroones 234 orr tmp2, data1, #REP8_7f 235 eor diff, data1, data2 /* Non-zero if differences found. */ 236 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 237 ccmp diff, #0, #0, eq 238 b.ne L(not_limit) 239 subs limit_wd, limit_wd, #1 240 b.pl L(loop_misaligned) 241 242L(done_loop): 243 /* We found a difference or a NULL before the limit was reached. */ 244 and limit, limit, #7 245 cbz limit, L(not_limit) 246 /* Read the last word. */ 247 sub src1, src1, 8 248 sub src2, src2, 8 249 ldr data1, [src1, limit] 250 ldr data2, [src2, limit] 251 sub tmp1, data1, zeroones 252 orr tmp2, data1, #REP8_7f 253 eor diff, data1, data2 /* Non-zero if differences found. */ 254 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 255 ccmp diff, #0, #0, eq 256 b.ne L(not_limit) 257 258L(ret0): 259 mov result, #0 260 ret 261 262END ( __strncmp_aarch64) 263