xref: /freebsd-src/lib/libc/amd64/string/strlen.S (revision 1d386b48a555f61cb7325543adbbb5c3f3407a66)
1*d8385768SRobert Clausecker/*-
27f06b217SMateusz Guzik * Written by Mateusz Guzik <mjg@freebsd.org>
3*d8385768SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation
4*d8385768SRobert Clausecker *
5*d8385768SRobert Clausecker * Portions of this software were developed by Robert Clausecker
6*d8385768SRobert Clausecker * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
7*d8385768SRobert Clausecker *
87f06b217SMateusz Guzik * Public domain.
97f06b217SMateusz Guzik */
107f06b217SMateusz Guzik
117f06b217SMateusz Guzik#include <machine/asm.h>
12*d8385768SRobert Clausecker#include "amd64_archlevel.h"
13*d8385768SRobert Clausecker
147f06b217SMateusz Guzik/*
157f06b217SMateusz Guzik * Note: the scalar routine below was written with kernel use in mind (read: no SIMD);
167f06b217SMateusz Guzik * it is only present in userspace as a temporary measure until something
177f06b217SMateusz Guzik * better gets imported.
187f06b217SMateusz Guzik */
197f06b217SMateusz Guzik
207f06b217SMateusz Guzik#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
217f06b217SMateusz Guzik
/*
 * Implementation table for strlen: names the two variants defined in this
 * file, "scalar" and "baseline".
 *
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are not defined here;
 * presumably they come from amd64_archlevel.h and drive the choice between
 * the variants.  The selection rule, and whether the order of the entries
 * matters, cannot be seen from this file -- confirm against that header.
 */
22*d8385768SRobert ClauseckerARCHFUNCS(strlen)
23*d8385768SRobert Clausecker	ARCHFUNC(strlen, scalar)
24*d8385768SRobert Clausecker	ARCHFUNC(strlen, baseline)
25*d8385768SRobert ClauseckerENDARCHFUNCS(strlen)
26*d8385768SRobert Clausecker
277f06b217SMateusz Guzik/*
287f06b217SMateusz Guzik * strlen(string)
297f06b217SMateusz Guzik *	  %rdi
307f06b217SMateusz Guzik *
317f06b217SMateusz Guzik * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
327f06b217SMateusz Guzik *
337f06b217SMateusz Guzik * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
347f06b217SMateusz Guzik * with leaq.
357f06b217SMateusz Guzik *
367f06b217SMateusz Guzik * For a description see either:
377f06b217SMateusz Guzik * - "Hacker's Delight" by Henry S. Warren, Jr.
387f06b217SMateusz Guzik * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
397f06b217SMateusz Guzik *   by Agner Fog
407f06b217SMateusz Guzik *
417f06b217SMateusz Guzik * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
427f06b217SMateusz Guzik */
/*
 * strlen, scalar variant: scans the string one aligned 8-byte word at a
 * time using general-purpose registers only.
 *
 * In:	%rdi = string pointer
 * Out:	%rax = number of bytes before the first NUL byte
 *
 * Register roles:
 *	%r8  = 0xfefefefefefefeff, i.e. 0 - 0x0101010101010101; adding it to
 *	       a word subtracts 0x0101010101010101 from that word
 *	%r9  = 0x8080808080808080, the top bit of every byte
 *	%r10 = original string pointer, kept for the final subtraction
 *	%rdi = address of the aligned word being examined
 *	%r11 = word loaded from (%rdi), then its complement
 *	%rcx = shift count in the head, afterwards the zero-byte mask
 *	%rdx = mask covering the bytes that precede the string (head only)
 * The flags are modified as well.  No stack is used.
 */
43*d8385768SRobert ClauseckerARCHENTRY(strlen, scalar)
447f06b217SMateusz Guzik	movabsq	$0xfefefefefefefeff,%r8
457f06b217SMateusz Guzik	movabsq	$0x8080808080808080,%r9
467f06b217SMateusz Guzik
	/*
	 * Save the original pointer for the length computation; a pointer
	 * that is already 8-byte aligned goes straight into the main loop.
	 */
477f06b217SMateusz Guzik	movq	%rdi,%r10
487f06b217SMateusz Guzik	movq	%rdi,%rcx
497f06b217SMateusz Guzik	testb	$7,%dil
507f06b217SMateusz Guzik	jz	2f
517f06b217SMateusz Guzik
527f06b217SMateusz Guzik	/*
537f06b217SMateusz Guzik	 * Handle misaligned reads: align to 8 and fill
547f06b217SMateusz Guzik	 * the spurious bytes.
557f06b217SMateusz Guzik	 */
	/*
	 * The shift only uses the low 6 bits of %cl, which hold
	 * 8 * (pointer & 7).  After the shift and the complement, %rdx has
	 * 0xff in every byte of the aligned word that lies before the start
	 * of the string; OR-ing it into the loaded word keeps those bytes
	 * from being mistaken for a NUL.
	 */
567f06b217SMateusz Guzik	andq	$~7,%rdi
577f06b217SMateusz Guzik	movq	(%rdi),%r11
587f06b217SMateusz Guzik	shlq	$3,%rcx
597f06b217SMateusz Guzik	movq	$-1,%rdx
607f06b217SMateusz Guzik	shlq	%cl,%rdx
617f06b217SMateusz Guzik	notq	%rdx
627f06b217SMateusz Guzik	orq	%rdx,%r11
637f06b217SMateusz Guzik
	/*
	 * %rcx = (word - 0x01....01) & ~word & 0x80....80, which is non-zero
	 * exactly when the word contains a zero byte.
	 */
647f06b217SMateusz Guzik	leaq	(%r11,%r8),%rcx
657f06b217SMateusz Guzik	notq	%r11
667f06b217SMateusz Guzik	andq	%r11,%rcx
677f06b217SMateusz Guzik	andq	%r9,%rcx
687f06b217SMateusz Guzik	jnz	3f
697f06b217SMateusz Guzik
707f06b217SMateusz Guzik	/*
717f06b217SMateusz Guzik	 * Main loop.
727f06b217SMateusz Guzik	 */
	/*
	 * Label 1 advances to the next word; label 2 is the entry point for
	 * strings that were already aligned, so their first word is tested
	 * before any advance.  The loop repeats while the mask is zero.
	 */
737f06b217SMateusz Guzik	ALIGN_TEXT
747f06b217SMateusz Guzik1:
757f06b217SMateusz Guzik	leaq	8(%rdi),%rdi
767f06b217SMateusz Guzik2:
777f06b217SMateusz Guzik	movq	(%rdi),%r11
787f06b217SMateusz Guzik	leaq	(%r11,%r8),%rcx
797f06b217SMateusz Guzik	notq	%r11
807f06b217SMateusz Guzik	andq	%r11,%rcx
817f06b217SMateusz Guzik	andq	%r9,%rcx
827f06b217SMateusz Guzik	jz	1b
837f06b217SMateusz Guzik3:
	/*
	 * The lowest set bit of %rcx is the top bit of the first zero byte
	 * (stray bits caused by borrow propagation can only show up above
	 * it), so bit index / 8 is that byte's offset within the word at
	 * (%rdi).  Length = address of that byte - original pointer.
	 */
847f06b217SMateusz Guzik	bsfq	%rcx,%rcx
857f06b217SMateusz Guzik	shrq	$3,%rcx
867f06b217SMateusz Guzik	leaq	(%rcx,%rdi),%rax
877f06b217SMateusz Guzik	subq	%r10,%rax
887f06b217SMateusz Guzik	ret
89*d8385768SRobert ClauseckerARCHEND(strlen, scalar)
90*d8385768SRobert Clausecker
/*
 * strlen, baseline variant: scans the string in aligned 16-byte chunks with
 * SSE2 byte compares (pcmpeqb + pmovmskb), two chunks per loop iteration.
 *
 * In:	%rdi = string pointer
 * Out:	%rax = number of bytes before the first NUL byte
 *
 * Register roles:
 *	%rsi  = original string pointer
 *	%rdi  = chunk cursor; inside the loop the chunks examined are
 *		-16(%rdi) and (%rdi)
 *	%ecx  = offset of the string start within its first 16-byte chunk
 *	%xmm1 = zeroed, then the per-byte "equals NUL" compare result
 *	%eax  = bit mask with one bit per chunk byte that is NUL
 * The flags are modified as well.  No stack is used.
 *
 * tzcnt is only ever executed with a non-zero %eax: both uses are reached
 * after a test found at least one set bit.
 * NOTE(review): tzcnt is a BMI1 instruction; on CPUs without BMI1 the same
 * encoding is expected to execute as bsf, which gives the same result for
 * non-zero input -- confirm that this is the intent for a "baseline" routine.
 */
91*d8385768SRobert ClauseckerARCHENTRY(strlen, baseline)
92*d8385768SRobert Clausecker	mov	%rdi, %rcx
93*d8385768SRobert Clausecker	pxor	%xmm1, %xmm1
94*d8385768SRobert Clausecker	and	$~0xf, %rdi			# align string
95*d8385768SRobert Clausecker	pcmpeqb	(%rdi), %xmm1			# compare head (with junk before string)
96*d8385768SRobert Clausecker	mov	%rcx, %rsi			# string pointer copy for later
97*d8385768SRobert Clausecker	and	$0xf, %ecx			# number of bytes the string start is past 16 byte alignment
98*d8385768SRobert Clausecker	pmovmskb %xmm1, %eax
99*d8385768SRobert Clausecker	add	$32, %rdi			# advance to next iteration
100*d8385768SRobert Clausecker	shr	%cl, %eax			# clear out matches in junk bytes
101*d8385768SRobert Clausecker	test	%eax, %eax			# any match? (can't use ZF from SHR as CL=0 is possible)
102*d8385768SRobert Clausecker	jnz	2f
103*d8385768SRobert Clausecker
	/*
	 * %rdi is now 32 bytes past the head chunk, so -16(%rdi) is the
	 * chunk that directly follows the head and (%rdi) the one after it.
	 */
104*d8385768SRobert Clausecker	ALIGN_TEXT
105*d8385768SRobert Clausecker1:	pxor	%xmm1, %xmm1
106*d8385768SRobert Clausecker	pcmpeqb	-16(%rdi), %xmm1		# find NUL bytes
107*d8385768SRobert Clausecker	pmovmskb %xmm1, %eax
108*d8385768SRobert Clausecker	test	%eax, %eax			# were any NUL bytes present?
109*d8385768SRobert Clausecker	jnz	3f
110*d8385768SRobert Clausecker
111*d8385768SRobert Clausecker	/* the same unrolled once more */
112*d8385768SRobert Clausecker	pxor	%xmm1, %xmm1
113*d8385768SRobert Clausecker	pcmpeqb	(%rdi), %xmm1
114*d8385768SRobert Clausecker	pmovmskb %xmm1, %eax
115*d8385768SRobert Clausecker	add	$32, %rdi			# advance to next iteration
116*d8385768SRobert Clausecker	test	%eax, %eax
117*d8385768SRobert Clausecker	jz	1b
118*d8385768SRobert Clausecker
	/*
	 * Falling through here, the matching chunk is at -32(%rdi) because
	 * the cursor was already advanced; arriving at 3 from the first half
	 * of the loop, it is at -16(%rdi).  The subtraction below makes both
	 * cases agree: at label 3 the matching chunk is always %rdi - 16.
	 */
119*d8385768SRobert Clausecker	/* match found in loop body */
120*d8385768SRobert Clausecker	sub	$16, %rdi			# undo half the advancement
121*d8385768SRobert Clausecker3:	tzcnt	%eax, %eax			# find the first NUL byte
122*d8385768SRobert Clausecker	sub	%rsi, %rdi			# string length until beginning of (%rdi)
123*d8385768SRobert Clausecker	lea	-16(%rdi, %rax, 1), %rax	# that plus loc. of NUL byte: full string length
124*d8385768SRobert Clausecker	ret
125*d8385768SRobert Clausecker
	/*
	 * After the shift by %cl, bit 0 of %eax corresponds to the first
	 * byte of the string, so the index of the lowest set bit is already
	 * the length.
	 */
126*d8385768SRobert Clausecker	/* match found in head */
127*d8385768SRobert Clausecker2:	tzcnt	%eax, %eax			# compute string length
128*d8385768SRobert Clausecker	ret
129*d8385768SRobert ClauseckerARCHEND(strlen, baseline)
1307f06b217SMateusz Guzik
1317f06b217SMateusz Guzik	.section .note.GNU-stack,"",%progbits
132