/*-
 * Written by Mateusz Guzik <mjg@freebsd.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Public domain.
 */

#include <machine/asm.h>
#include "amd64_archlevel.h"

/*
 * Note: the scalar routine was written with kernel use in mind (read: no
 * simd), it is only present in userspace as a temporary measure until
 * something better gets imported.
 */

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/*
 * Table of the strlen implementations contained in this file.
 * NOTE(review): the ARCHFUNCS/ARCHFUNC/ARCHENTRY/ARCHEND macros are
 * presumably provided by amd64_archlevel.h; how an entry is selected is
 * not visible from this file.
 */
ARCHFUNCS(strlen)
	ARCHFUNC(strlen, scalar)
	ARCHFUNC(strlen, baseline)
ENDARCHFUNCS(strlen)

/*
 * strlen(string)
 *	  %rdi
 *
 * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
 *
 * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
 * with leaq.
 *
 * For a description see either:
 * - "Hacker's Delight" by Henry S. Warren, Jr.
 * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
 *   by Agner Fog
 *
 * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
 *
 * Register use:
 *	%rdi	current 8-byte aligned word address
 *	%r8	0 - 0x0101010101010101
 *	%r9	0x8080808080808080
 *	%r10	original string pointer
 *	%r11	word loaded from the string (then its complement)
 *	%rcx	shift count for the head, afterwards the NUL-byte mask
 *	%rdx	mask covering the bytes in front of a misaligned string
 *	%rax	result
 */
ARCHENTRY(strlen, scalar)
	movabsq	$0xfefefefefefefeff,%r8
	movabsq	$0x8080808080808080,%r9

	movq	%rdi,%r10
	movq	%rdi,%rcx
	testb	$7,%dil
	jz	2f

	/*
	 * Handle misaligned reads: align to 8 and fill
	 * the spurious bytes.
	 *
	 * The shift count is 8 * address; shlq only uses the low 6 bits of
	 * %cl, i.e. 8 * (address & 7).  After notq, %rdx has all bits of the
	 * bytes preceding the string set, so or-ing it in turns those bytes
	 * into 0xff and they cannot be mistaken for a NUL.
	 */
	andq	$~7,%rdi
	movq	(%rdi),%r11
	shlq	$3,%rcx
	movq	$-1,%rdx
	shlq	%cl,%rdx
	notq	%rdx
	orq	%rdx,%r11

	/* %rcx = (x - 0x01....01) & ~x & 0x80....80, nonzero iff a NUL is present */
	leaq	(%r11,%r8),%rcx
	notq	%r11
	andq	%r11,%rcx
	andq	%r9,%rcx
	jnz	3f

	/*
	 * Main loop.
	 *
	 * Entered at 1: after the head word (advance first), or at 2: when
	 * the string was already aligned (check the first word as is).
	 */
	ALIGN_TEXT
1:
	leaq	8(%rdi),%rdi
2:
	movq	(%rdi),%r11
	leaq	(%r11,%r8),%rcx
	notq	%r11
	andq	%r11,%rcx
	andq	%r9,%rcx
	jz	1b
3:
	/*
	 * The lowest set bit of %rcx is bit 7 of the first NUL byte:
	 * bit index / 8 is the byte offset within the word at (%rdi).
	 * length = word address + byte offset - original pointer.
	 */
	bsfq	%rcx,%rcx
	shrq	$3,%rcx
	leaq	(%rcx,%rdi),%rax
	subq	%r10,%rax
	ret
ARCHEND(strlen, scalar)

/*
 * strlen(string), SSE2 variant processing 16 bytes per compare.
 *		  %rdi
 *
 * Register use:
 *	%rdi	16-byte aligned cursor; inside the loop it points 16 bytes
 *		past the first of the two chunks examined per iteration
 *	%rsi	original string pointer
 *	%ecx	string pointer & 0xf (misalignment of the head)
 *	%xmm1	zeroed, then per-byte "== 0" compare result
 *	%eax	byte mask from pmovmskb, finally the result
 *
 * The head is read from the aligned address at or below the string, so
 * the load stays inside the aligned 16-byte chunk that holds the first
 * byte of the string; bytes in front of the string are discarded by
 * shifting the mask right.
 *
 * NOTE(review): tzcnt is only reached with a nonzero %eax.  Per the Intel
 * SDM the encoding executes as bsf on CPUs without BMI1, which yields the
 * same result for nonzero input -- confirm if this matters for a target.
 */
ARCHENTRY(strlen, baseline)
	mov	%rdi, %rcx
	pxor	%xmm1, %xmm1
	and	$~0xf, %rdi			# align string
	pcmpeqb	(%rdi), %xmm1			# compare head (with junk before string)
	mov	%rcx, %rsi			# string pointer copy for later
	and	$0xf, %ecx			# amount of bytes rdi is past 16 byte alignment
	pmovmskb %xmm1, %eax
	add	$32, %rdi			# advance to next iteration
	shr	%cl, %eax			# clear out matches in junk bytes
	test	%eax, %eax			# any match? (can't use ZF from SHR as CL=0 is possible)
	jnz	2f

	ALIGN_TEXT
1:	pxor	%xmm1, %xmm1
	pcmpeqb	-16(%rdi), %xmm1		# find NUL bytes
	pmovmskb %xmm1, %eax
	test	%eax, %eax			# were any NUL bytes present?
	jnz	3f

	/* the same unrolled once more */
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rdi			# advance to next iteration
	test	%eax, %eax
	jz	1b

	/* match found in loop body */
	sub	$16, %rdi			# undo half the advancement

	/* here the chunk containing the NUL byte starts at -16(%rdi) */
3:	tzcnt	%eax, %eax			# find the first NUL byte
	sub	%rsi, %rdi			# string length until beginning of (%rdi)
	lea	-16(%rdi, %rax, 1), %rax	# that plus loc. of NUL byte: full string length
	ret

	/* match found in head */
	/* after the shift, bit i of %eax corresponds to string byte i */
2:	tzcnt	%eax, %eax			# compute string length
	ret
ARCHEND(strlen, baseline)

	.section .note.GNU-stack,"",%progbits