xref: /freebsd-src/lib/libc/amd64/string/memcmp.S (revision 953b93cf24d8871c62416c9bcfca935f1f1853b6)
15bbde333SMateusz Guzik/*-
28803f01eSRobert Clausecker * Copyright (c) 2018, 2023 The FreeBSD Foundation
35bbde333SMateusz Guzik *
45bbde333SMateusz Guzik * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
55bbde333SMateusz Guzik * under sponsorship from the FreeBSD Foundation.
65bbde333SMateusz Guzik *
78803f01eSRobert Clausecker * Portions of this software were developed by Robert Clausecker
88803f01eSRobert Clausecker * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
98803f01eSRobert Clausecker *
105bbde333SMateusz Guzik * Redistribution and use in source and binary forms, with or without
115bbde333SMateusz Guzik * modification, are permitted provided that the following conditions
125bbde333SMateusz Guzik * are met:
135bbde333SMateusz Guzik * 1. Redistributions of source code must retain the above copyright
145bbde333SMateusz Guzik *    notice, this list of conditions and the following disclaimer.
155bbde333SMateusz Guzik * 2. Redistributions in binary form must reproduce the above copyright
165bbde333SMateusz Guzik *    notice, this list of conditions and the following disclaimer in the
175bbde333SMateusz Guzik *    documentation and/or other materials provided with the distribution.
185bbde333SMateusz Guzik *
195bbde333SMateusz Guzik * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
205bbde333SMateusz Guzik * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
215bbde333SMateusz Guzik * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
225bbde333SMateusz Guzik * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
235bbde333SMateusz Guzik * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
245bbde333SMateusz Guzik * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
255bbde333SMateusz Guzik * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
265bbde333SMateusz Guzik * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
275bbde333SMateusz Guzik * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
285bbde333SMateusz Guzik * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
295bbde333SMateusz Guzik * SUCH DAMAGE.
3026f6218bSAlan Cox */
3126f6218bSAlan Cox
3226f6218bSAlan Cox#include <machine/asm.h>
338803f01eSRobert Clausecker#include <machine/param.h>
348803f01eSRobert Clausecker
358803f01eSRobert Clausecker#include "amd64_archlevel.h"
368803f01eSRobert Clausecker
370db6aef4SMateusz Guzik/*
380db6aef4SMateusz Guzik * Note: this routine was written with kernel use in mind (read: no simd),
390db6aef4SMateusz Guzik * it is only present in userspace as a temporary measure until something
400db6aef4SMateusz Guzik * better gets imported.
410db6aef4SMateusz Guzik */
420db6aef4SMateusz Guzik
434846152aSMateusz Guzik#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
444846152aSMateusz Guzik
45fbc002cbSMateusz Guzik#ifdef BCMP
468803f01eSRobert Clausecker#define memcmp bcmp
47fbc002cbSMateusz Guzik#endif
488803f01eSRobert Clausecker
/*
 * Dispatch table: the archlevel framework (see amd64_archlevel.h)
 * presumably resolves memcmp -- or bcmp, when built with -DBCMP --
 * to the best variant the CPU supports: the plain "scalar" kernel
 * or the SSE "baseline" kernel defined below.
 */
498803f01eSRobert ClauseckerARCHFUNCS(memcmp)
508803f01eSRobert Clausecker	ARCHFUNC(memcmp, scalar)
518803f01eSRobert Clausecker	ARCHFUNC(memcmp, baseline)
528803f01eSRobert ClauseckerENDARCHFUNCS(memcmp)
538803f01eSRobert Clausecker
548803f01eSRobert ClauseckerARCHENTRY(memcmp, scalar)
/*
 * Scalar (non-SIMD) kernel: %rdi = s1, %rsi = s2, %rdx = len; result in %eax.
 * Buffers up to 32 bytes are compared with overlapping loads of the widest
 * size that fits; longer buffers go through a 32-bytes-per-iteration loop.
 * The numeric local labels appear to encode the length range they serve
 * (100816 = 9..16 bytes, 100408 = 5..8, ...) with mismatch continuations
 * appending the offset of the failing load (10081608 = range 9..16,
 * mismatch in the load 8 bytes in / at the tail).
 */
555bbde333SMateusz Guzik	xorl	%eax,%eax	/* return value if the buffers are equal */
/*
 * Size dispatch; re-entered from the long loop below to handle a
 * 1..31 byte tail.
 */
564846152aSMateusz Guzik10:
574846152aSMateusz Guzik	cmpq	$16,%rdx
584846152aSMateusz Guzik	ja	101632f
594846152aSMateusz Guzik
604846152aSMateusz Guzik	cmpb	$8,%dl
61f1be262eSMateusz Guzik	jg	100816f
62f1be262eSMateusz Guzik
63f1be262eSMateusz Guzik	cmpb	$4,%dl
64f1be262eSMateusz Guzik	jg	100408f
65f1be262eSMateusz Guzik
66f1be262eSMateusz Guzik	cmpb	$2,%dl
67f1be262eSMateusz Guzik	jge	100204f
68f1be262eSMateusz Guzik
/* 0 or 1 byte: len == 0 returns the 0 already in %eax */
69f1be262eSMateusz Guzik	cmpb	$1,%dl
70f1be262eSMateusz Guzik	jl	100000f
71f1be262eSMateusz Guzik	movzbl	(%rdi),%eax
72f1be262eSMateusz Guzik	movzbl	(%rsi),%r8d
73f1be262eSMateusz Guzik	subl	%r8d,%eax	/* difference of zero-extended bytes */
74f1be262eSMateusz Guzik100000:
75f1be262eSMateusz Guzik	ret
76f1be262eSMateusz Guzik
/*
 * 9..16 bytes: compare the first and the last 8 bytes.  The two loads
 * may overlap, which is harmless when only testing for equality.
 */
77f1be262eSMateusz Guzik	ALIGN_TEXT
78f1be262eSMateusz Guzik100816:
795bbde333SMateusz Guzik	movq	(%rdi),%r8
805bbde333SMateusz Guzik	movq	(%rsi),%r9
815bbde333SMateusz Guzik	cmpq	%r8,%r9
828291e887SMateusz Guzik	jne	80f
834846152aSMateusz Guzik	movq	-8(%rdi,%rdx),%r8
844846152aSMateusz Guzik	movq	-8(%rsi,%rdx),%r9
854846152aSMateusz Guzik	cmpq	%r8,%r9
864846152aSMateusz Guzik	jne	10081608f
874846152aSMateusz Guzik	ret
/* 5..8 bytes: first and last 4 bytes, possibly overlapping */
88f1be262eSMateusz Guzik	ALIGN_TEXT
894846152aSMateusz Guzik100408:
908291e887SMateusz Guzik	movl	(%rdi),%r8d
918291e887SMateusz Guzik	movl	(%rsi),%r9d
924846152aSMateusz Guzik	cmpl	%r8d,%r9d
938291e887SMateusz Guzik	jne	80f
948291e887SMateusz Guzik	movl	-4(%rdi,%rdx),%r8d
958291e887SMateusz Guzik	movl	-4(%rsi,%rdx),%r9d
964846152aSMateusz Guzik	cmpl	%r8d,%r9d
978291e887SMateusz Guzik	jne	10040804f
984846152aSMateusz Guzik	ret
/* 2..4 bytes: first and last 2 bytes, possibly overlapping */
99f1be262eSMateusz Guzik	ALIGN_TEXT
1004846152aSMateusz Guzik100204:
1018291e887SMateusz Guzik	movzwl	(%rdi),%r8d
1028291e887SMateusz Guzik	movzwl	(%rsi),%r9d
1034846152aSMateusz Guzik	cmpl	%r8d,%r9d
1044846152aSMateusz Guzik	jne	1f
1058291e887SMateusz Guzik	movzwl	-2(%rdi,%rdx),%r8d
1068291e887SMateusz Guzik	movzwl	-2(%rsi,%rdx),%r9d
1074846152aSMateusz Guzik	cmpl	%r8d,%r9d
1084846152aSMateusz Guzik	jne	1f
1094846152aSMateusz Guzik	ret
/*
 * 17..32 bytes: two qwords from the front and two (possibly
 * overlapping) qwords from the tail; longer buffers fall to the loop.
 */
1104846152aSMateusz Guzik	ALIGN_TEXT
1114846152aSMateusz Guzik101632:
1124846152aSMateusz Guzik	cmpq	$32,%rdx
1134846152aSMateusz Guzik	ja	103200f
1144846152aSMateusz Guzik	movq	(%rdi),%r8
1154846152aSMateusz Guzik	movq	(%rsi),%r9
1164846152aSMateusz Guzik	cmpq	%r8,%r9
1178291e887SMateusz Guzik	jne	80f
1184846152aSMateusz Guzik	movq	8(%rdi),%r8
1195bbde333SMateusz Guzik	movq	8(%rsi),%r9
1204846152aSMateusz Guzik	cmpq	%r8,%r9
1214846152aSMateusz Guzik	jne	10163208f
1224846152aSMateusz Guzik	movq	-16(%rdi,%rdx),%r8
1234846152aSMateusz Guzik	movq	-16(%rsi,%rdx),%r9
1244846152aSMateusz Guzik	cmpq	%r8,%r9
1254846152aSMateusz Guzik	jne	10163216f
1264846152aSMateusz Guzik	movq	-8(%rdi,%rdx),%r8
1274846152aSMateusz Guzik	movq	-8(%rsi,%rdx),%r9
1284846152aSMateusz Guzik	cmpq	%r8,%r9
1294846152aSMateusz Guzik	jne	10163224f
1304846152aSMateusz Guzik	ret
/*
 * 33+ bytes: 32 per iteration.  sub + or detects a difference in a pair
 * of qwords cheaply but destroys the loaded values, so the mismatch
 * paths below reload and re-compare to pinpoint the failing qword.
 */
1314846152aSMateusz Guzik	ALIGN_TEXT
1324846152aSMateusz Guzik103200:
1334846152aSMateusz Guzik	movq	(%rdi),%r8
1344846152aSMateusz Guzik	movq	8(%rdi),%r9
1354846152aSMateusz Guzik	subq	(%rsi),%r8
1364846152aSMateusz Guzik	subq	8(%rsi),%r9
1378291e887SMateusz Guzik	orq	%r8,%r9
1384846152aSMateusz Guzik	jnz	10320000f
1395bbde333SMateusz Guzik
1404846152aSMateusz Guzik	movq    16(%rdi),%r8
1414846152aSMateusz Guzik	movq    24(%rdi),%r9
1424846152aSMateusz Guzik	subq    16(%rsi),%r8
1434846152aSMateusz Guzik	subq    24(%rsi),%r9
1448291e887SMateusz Guzik	orq	%r8,%r9
1454846152aSMateusz Guzik	jnz     10320016f
1465bbde333SMateusz Guzik
1475bbde333SMateusz Guzik	leaq	32(%rdi),%rdi
1485bbde333SMateusz Guzik	leaq	32(%rsi),%rsi
1495bbde333SMateusz Guzik	subq	$32,%rdx
1505bbde333SMateusz Guzik	cmpq	$32,%rdx
1514846152aSMateusz Guzik	jae	103200b
/* 1..31 bytes may remain; if so, re-dispatch through the small-size code */
1524846152aSMateusz Guzik	cmpb	$0,%dl
1534846152aSMateusz Guzik	jne	10b
1544846152aSMateusz Guzik	ret
1554846152aSMateusz Guzik
1568291e887SMateusz Guzik/*
1578291e887SMateusz Guzik * Mismatch was found.
158fbc002cbSMateusz Guzik */
159fbc002cbSMateusz Guzik#ifdef BCMP
160fbc002cbSMateusz Guzik	ALIGN_TEXT
161fbc002cbSMateusz Guzik10320016:
162fbc002cbSMateusz Guzik10320000:
163fbc002cbSMateusz Guzik10081608:
164fbc002cbSMateusz Guzik10163224:
165fbc002cbSMateusz Guzik10163216:
166fbc002cbSMateusz Guzik10163208:
167fbc002cbSMateusz Guzik10040804:
168fbc002cbSMateusz Guzik80:
169fbc002cbSMateusz Guzik1:
/*
 * bcmp() only has to return non-zero on a mismatch; %eax is still 0 on
 * every path into here, so this produces 1.
 */
170fbc002cbSMateusz Guzik	leal	1(%eax),%eax
171fbc002cbSMateusz Guzik	ret
172fbc002cbSMateusz Guzik#else
173fbc002cbSMateusz Guzik/*
174fbc002cbSMateusz Guzik * We need to compute the difference between strings.
175fbc002cbSMateusz Guzik * Start with narrowing the range down (16 -> 8 -> 4 bytes).
1768291e887SMateusz Guzik */
1778291e887SMateusz Guzik	ALIGN_TEXT
/* difference in qwords 2..3 of the current chunk: skip the first 16 bytes */
1784846152aSMateusz Guzik10320016:
1794846152aSMateusz Guzik	leaq	16(%rdi),%rdi
1804846152aSMateusz Guzik	leaq	16(%rsi),%rsi
/* difference somewhere in qwords 0..1: find which and narrow to it */
1814846152aSMateusz Guzik10320000:
1824846152aSMateusz Guzik	movq	(%rdi),%r8
1834846152aSMateusz Guzik	movq	(%rsi),%r9
1844846152aSMateusz Guzik	cmpq	%r8,%r9
1858291e887SMateusz Guzik	jne	80f
/* first qword equal, so the second one must be the differing one */
1864846152aSMateusz Guzik	leaq	8(%rdi),%rdi
1874846152aSMateusz Guzik	leaq	8(%rsi),%rsi
1888291e887SMateusz Guzik	jmp	80f
1898291e887SMateusz Guzik	ALIGN_TEXT
/* difference in the trailing 8 bytes (loaded at len - 8) */
1908291e887SMateusz Guzik10081608:
1914846152aSMateusz Guzik10163224:
1924846152aSMateusz Guzik	leaq	-8(%rdi,%rdx),%rdi
1934846152aSMateusz Guzik	leaq	-8(%rsi,%rdx),%rsi
1948291e887SMateusz Guzik	jmp	80f
1958291e887SMateusz Guzik	ALIGN_TEXT
/* difference in bytes len-16 .. len-9 */
1964846152aSMateusz Guzik10163216:
1974846152aSMateusz Guzik	leaq	-16(%rdi,%rdx),%rdi
1984846152aSMateusz Guzik	leaq	-16(%rsi,%rdx),%rsi
1998291e887SMateusz Guzik	jmp	80f
2008291e887SMateusz Guzik	ALIGN_TEXT
/* difference in the second qword */
2014846152aSMateusz Guzik10163208:
2024846152aSMateusz Guzik	leaq	8(%rdi),%rdi
2034846152aSMateusz Guzik	leaq	8(%rsi),%rsi
2048291e887SMateusz Guzik	jmp	80f
2058291e887SMateusz Guzik	ALIGN_TEXT
/* difference in the trailing 4 bytes: go straight to the byte loop */
2068291e887SMateusz Guzik10040804:
2078291e887SMateusz Guzik	leaq	-4(%rdi,%rdx),%rdi
2088291e887SMateusz Guzik	leaq	-4(%rsi,%rdx),%rsi
2094846152aSMateusz Guzik	jmp	1f
2104846152aSMateusz Guzik
2114846152aSMateusz Guzik	ALIGN_TEXT
/*
 * %rdi/%rsi point at a differing 8- (or 4-) byte region; narrow it
 * down to the differing 4 bytes.
 */
2128291e887SMateusz Guzik80:
2138291e887SMateusz Guzik	movl	(%rdi),%r8d
2148291e887SMateusz Guzik	movl	(%rsi),%r9d
2158291e887SMateusz Guzik	cmpl	%r8d,%r9d
2168291e887SMateusz Guzik	jne	1f
2178291e887SMateusz Guzik	leaq	4(%rdi),%rdi
2188291e887SMateusz Guzik	leaq	4(%rsi),%rsi
2198291e887SMateusz Guzik
2208291e887SMateusz Guzik/*
2218291e887SMateusz Guzik * We have up to 4 bytes to inspect.
2228291e887SMateusz Guzik */
2234846152aSMateusz Guzik1:
2244846152aSMateusz Guzik	movzbl	(%rdi),%eax
2254846152aSMateusz Guzik	movzbl	(%rsi),%r8d
2264846152aSMateusz Guzik	cmpb	%r8b,%al
2274846152aSMateusz Guzik	jne	2f
2284846152aSMateusz Guzik
2294846152aSMateusz Guzik	movzbl	1(%rdi),%eax
2304846152aSMateusz Guzik	movzbl	1(%rsi),%r8d
2314846152aSMateusz Guzik	cmpb	%r8b,%al
2324846152aSMateusz Guzik	jne	2f
2334846152aSMateusz Guzik
2344846152aSMateusz Guzik	movzbl	2(%rdi),%eax
2354846152aSMateusz Guzik	movzbl	2(%rsi),%r8d
2364846152aSMateusz Guzik	cmpb	%r8b,%al
2374846152aSMateusz Guzik	jne	2f
2384846152aSMateusz Guzik
/* bytes 0..2 matched, so the difference is in byte 3 */
2394846152aSMateusz Guzik	movzbl	3(%rdi),%eax
2404846152aSMateusz Guzik	movzbl	3(%rsi),%r8d
2414846152aSMateusz Guzik2:
2424846152aSMateusz Guzik	subl	%r8d,%eax	/* difference of zero-extended bytes */
2434846152aSMateusz Guzik	ret
244fbc002cbSMateusz Guzik#endif
2458803f01eSRobert ClauseckerARCHEND(memcmp, scalar)
2468803f01eSRobert Clausecker
2478803f01eSRobert ClauseckerARCHENTRY(memcmp, baseline)
/*
 * SSE "baseline" kernel: %rdi = s1, %rsi = s2, %rdx = len; result in %eax.
 * Inputs of 1..32 bytes are handled with two overlapping 16-byte vector
 * loads per buffer; longer inputs go through a 32-bytes-per-iteration
 * loop with %rdi aligned down to 16 bytes.
 */
2488803f01eSRobert Clausecker	cmp		$32, %rdx		# enough to permit use of the long kernel?
2498803f01eSRobert Clausecker	ja		.Llong
2508803f01eSRobert Clausecker
2518803f01eSRobert Clausecker	test		%rdx, %rdx		# zero bytes buffer?
2528803f01eSRobert Clausecker	je		.L0
2538803f01eSRobert Clausecker
2548803f01eSRobert Clausecker	/*
2558803f01eSRobert Clausecker	 * Compare strings of 1--32 bytes.  We want to do this by
2568803f01eSRobert Clausecker	 * loading into two xmm registers and then comparing.  To avoid
2578803f01eSRobert Clausecker	 * crossing into unmapped pages, we either load 32 bytes from
2588803f01eSRobert Clausecker	 * the start of the buffer or 32 bytes before its end, depending
2598803f01eSRobert Clausecker	 * on whether there is a page boundary between the overread area
2608803f01eSRobert Clausecker	 * or not.
2618803f01eSRobert Clausecker	 */
2628803f01eSRobert Clausecker
2638803f01eSRobert Clausecker	/* check for page boundaries overreads */
2648803f01eSRobert Clausecker	lea		31(%rdi), %eax		# end of overread
2658803f01eSRobert Clausecker	lea		31(%rsi), %r8d
2668803f01eSRobert Clausecker	lea		-1(%rdi, %rdx, 1), %ecx	# last character in buffer
2678803f01eSRobert Clausecker	lea		-1(%rsi, %rdx, 1), %r9d
2688803f01eSRobert Clausecker	xor		%ecx, %eax
2698803f01eSRobert Clausecker	xor		%r9d, %r8d
2708803f01eSRobert Clausecker	test		$PAGE_SIZE, %eax	# are they on different pages?
2718803f01eSRobert Clausecker	jz		0f
2728803f01eSRobert Clausecker
	/*
	 * NOTE: the replacement buffers at -72..-9(%rsp) below live in
	 * the 128-byte SysV red zone; this is a leaf function, so that
	 * space is usable without adjusting %rsp.
	 */
2738803f01eSRobert Clausecker	/* fix up rdi */
2748803f01eSRobert Clausecker	movdqu		-32(%rdi, %rdx, 1), %xmm0
2758803f01eSRobert Clausecker	movdqu		-16(%rdi, %rdx, 1), %xmm1
2768803f01eSRobert Clausecker	lea		-8(%rsp), %rdi		# end of replacement buffer
2778803f01eSRobert Clausecker	sub		%rdx, %rdi		# start of replacement buffer
2788803f01eSRobert Clausecker	movdqa		%xmm0, -40(%rsp)	# copy to replacement buffer
2798803f01eSRobert Clausecker	movdqa		%xmm1, -24(%rsp)
2808803f01eSRobert Clausecker
2818803f01eSRobert Clausecker0:	test		$PAGE_SIZE, %r8d
2828803f01eSRobert Clausecker	jz		0f
2838803f01eSRobert Clausecker
2848803f01eSRobert Clausecker	/* fix up rsi */
2858803f01eSRobert Clausecker	movdqu		-32(%rsi, %rdx, 1), %xmm0
2868803f01eSRobert Clausecker	movdqu		-16(%rsi, %rdx, 1), %xmm1
2878803f01eSRobert Clausecker	lea		-40(%rsp), %rsi		# end of replacement buffer
2888803f01eSRobert Clausecker	sub		%rdx, %rsi		# start of replacement buffer
2898803f01eSRobert Clausecker	movdqa		%xmm0, -72(%rsp)	# copy to replacement buffer
2908803f01eSRobert Clausecker	movdqa		%xmm1, -56(%rsp)
2918803f01eSRobert Clausecker
2928803f01eSRobert Clausecker	/* load data and compare properly */
2938803f01eSRobert Clausecker0:	movdqu		16(%rdi), %xmm1
2948803f01eSRobert Clausecker	movdqu		16(%rsi), %xmm3
2958803f01eSRobert Clausecker	movdqu		(%rdi), %xmm0
2968803f01eSRobert Clausecker	movdqu		(%rsi), %xmm2
2978803f01eSRobert Clausecker	mov		%edx, %ecx
	/* %rdx = ~0 << len; 64-bit shift so len == 32 leaves the low half clear */
2988803f01eSRobert Clausecker	mov		$-1, %edx
2998803f01eSRobert Clausecker	shl		%cl, %rdx		# ones where the buffer is not
3008803f01eSRobert Clausecker	pcmpeqb		%xmm3, %xmm1
3018803f01eSRobert Clausecker	pcmpeqb		%xmm2, %xmm0
3028803f01eSRobert Clausecker	pmovmskb	%xmm1, %ecx
3038803f01eSRobert Clausecker	pmovmskb	%xmm0, %eax
3048803f01eSRobert Clausecker	shl		$16, %ecx
3058803f01eSRobert Clausecker	or		%ecx, %eax		# ones where the buffers match
3068803f01eSRobert Clausecker	or		%edx, %eax		# including where the buffer is not
3078803f01eSRobert Clausecker	not		%eax			# ones where there is a mismatch
3088803f01eSRobert Clausecker#ifndef BCMP
3098803f01eSRobert Clausecker	bsf		%eax, %edx		# location of the first mismatch
3108803f01eSRobert Clausecker	cmovz		%eax, %edx		# including if there is no mismatch
	/* if %eax == 0 (equal), %edx becomes 0 and byte 0 is compared with itself */
3118803f01eSRobert Clausecker	movzbl		(%rdi, %rdx, 1), %eax	# mismatching bytes
3128803f01eSRobert Clausecker	movzbl		(%rsi, %rdx, 1), %edx
3138803f01eSRobert Clausecker	sub		%edx, %eax
3148803f01eSRobert Clausecker#endif
3158803f01eSRobert Clausecker	ret
3168803f01eSRobert Clausecker
3178803f01eSRobert Clausecker	/* empty input */
3188803f01eSRobert Clausecker.L0:	xor		%eax, %eax
3198803f01eSRobert Clausecker	ret
3208803f01eSRobert Clausecker
3218803f01eSRobert Clausecker	/* compare 33+ bytes */
3228803f01eSRobert Clausecker	ALIGN_TEXT
3238803f01eSRobert Clausecker.Llong:	movdqu		(%rdi), %xmm0		# load head
3248803f01eSRobert Clausecker	movdqu		(%rsi), %xmm2
3258803f01eSRobert Clausecker	mov		%rdi, %rcx
3268803f01eSRobert Clausecker	sub		%rdi, %rsi		# express rsi as distance from rdi
3278803f01eSRobert Clausecker	and		$~0xf, %rdi		# align rdi to 16 bytes
3288803f01eSRobert Clausecker	movdqu		16(%rsi, %rdi, 1), %xmm1
3298803f01eSRobert Clausecker	pcmpeqb		16(%rdi), %xmm1		# compare second half of this iteration
3308803f01eSRobert Clausecker	add		%rcx, %rdx		# pointer to last byte in buffer
331*953b93cfSRobert Clausecker	jc		.Loverflow		# did this overflow?
332*953b93cfSRobert Clausecker0:	pcmpeqb		%xmm2, %xmm0
3338803f01eSRobert Clausecker	pmovmskb	%xmm0, %eax
3348803f01eSRobert Clausecker	xor		$0xffff, %eax		# any mismatch?
3358803f01eSRobert Clausecker	jne		.Lmismatch_head
3368803f01eSRobert Clausecker	add		$64, %rdi		# advance to next iteration
3378803f01eSRobert Clausecker	jmp		1f			# and get going with the loop
3388803f01eSRobert Clausecker
339*953b93cfSRobert Clausecker	/*
340*953b93cfSRobert Clausecker	 * If we got here, a buffer length was passed to memcmp(a, b, len)
341*953b93cfSRobert Clausecker	 * such that a + len < a.  While this sort of usage is illegal,
342*953b93cfSRobert Clausecker	 * it is plausible that a caller tries to do something like
343*953b93cfSRobert Clausecker	 * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending
344*953b93cfSRobert Clausecker	 * for memcmp() to stop comparing at the first mismatch.  This
345*953b93cfSRobert Clausecker	 * behaviour is not guaranteed by any version of ISO/IEC 9899,
346*953b93cfSRobert Clausecker	 * but usually works out in practice.  Let's try to make this
347*953b93cfSRobert Clausecker	 * case work by comparing until the end of the address space.
348*953b93cfSRobert Clausecker	 */
349*953b93cfSRobert Clausecker.Loverflow:
350*953b93cfSRobert Clausecker	mov		$-1, %rdx		# compare until the end of memory
351*953b93cfSRobert Clausecker	jmp		0b
352*953b93cfSRobert Clausecker
3538803f01eSRobert Clausecker	/* process buffer 32 bytes at a time */
3548803f01eSRobert Clausecker	ALIGN_TEXT
3558803f01eSRobert Clausecker0:	movdqu		-32(%rsi, %rdi, 1), %xmm0
3568803f01eSRobert Clausecker	movdqu		-16(%rsi, %rdi, 1), %xmm1
3578803f01eSRobert Clausecker	pcmpeqb		-32(%rdi), %xmm0
3588803f01eSRobert Clausecker	pcmpeqb		-16(%rdi), %xmm1
3598803f01eSRobert Clausecker	add		$32, %rdi		# advance to next iteration
3608803f01eSRobert Clausecker1:	pand		%xmm0, %xmm1		# 0xff where both halves matched
3618803f01eSRobert Clausecker	pmovmskb	%xmm1, %eax
3628803f01eSRobert Clausecker	cmp		$0xffff, %eax		# all bytes matched?
3638803f01eSRobert Clausecker	jne		.Lmismatch
3648803f01eSRobert Clausecker	cmp		%rdx, %rdi		# end of buffer reached?
3658803f01eSRobert Clausecker	jb		0b
3668803f01eSRobert Clausecker
3678803f01eSRobert Clausecker	/* less than 32 bytes left to compare */
3688803f01eSRobert Clausecker	movdqu		-16(%rdx), %xmm1	# load 32 byte tail through end pointer
3698803f01eSRobert Clausecker	movdqu		-16(%rdx, %rsi, 1), %xmm3
3708803f01eSRobert Clausecker	movdqu		-32(%rdx), %xmm0
3718803f01eSRobert Clausecker	movdqu		-32(%rdx, %rsi, 1), %xmm2
3728803f01eSRobert Clausecker	pcmpeqb		%xmm3, %xmm1
3738803f01eSRobert Clausecker	pcmpeqb		%xmm2, %xmm0
3748803f01eSRobert Clausecker	pmovmskb	%xmm1, %ecx
3758803f01eSRobert Clausecker	pmovmskb	%xmm0, %eax
3768803f01eSRobert Clausecker	shl		$16, %ecx
3778803f01eSRobert Clausecker	or		%ecx, %eax		# ones where the buffers match
3788803f01eSRobert Clausecker	not		%eax			# ones where there is a mismatch
3798803f01eSRobert Clausecker#ifndef BCMP
3808803f01eSRobert Clausecker	bsf		%eax, %ecx		# location of the first mismatch
3818803f01eSRobert Clausecker	cmovz		%eax, %ecx		# including if there is no mismatch
3828803f01eSRobert Clausecker	add		%rcx, %rdx		# pointer to potential mismatch
3838803f01eSRobert Clausecker	movzbl		-32(%rdx), %eax		# mismatching bytes
3848803f01eSRobert Clausecker	movzbl		-32(%rdx, %rsi, 1), %edx
3858803f01eSRobert Clausecker	sub		%edx, %eax
3868803f01eSRobert Clausecker#endif
3878803f01eSRobert Clausecker	ret
3888803f01eSRobert Clausecker
3898803f01eSRobert Clausecker#ifdef BCMP
	/*
	 * bcmp() only needs a non-zero result.  On the head path %eax
	 * already holds a non-zero mismatch mask; on the loop path load 1.
	 */
3908803f01eSRobert Clausecker.Lmismatch:
3918803f01eSRobert Clausecker	mov		$1, %eax
3928803f01eSRobert Clausecker.Lmismatch_head:
3938803f01eSRobert Clausecker	ret
3948803f01eSRobert Clausecker#else /* memcmp */
	/* %rcx still holds the original (unaligned) s1 from .Llong */
3958803f01eSRobert Clausecker.Lmismatch_head:
	/* NOTE: tzcnt decodes as bsf on pre-BMI CPUs; %eax != 0 here, so both agree */
3968803f01eSRobert Clausecker	tzcnt		%eax, %eax		# location of mismatch
3978803f01eSRobert Clausecker	add		%rax, %rcx		# pointer to mismatch
3988803f01eSRobert Clausecker	movzbl		(%rcx), %eax		# mismatching bytes
3998803f01eSRobert Clausecker	movzbl		(%rcx, %rsi, 1), %ecx
4008803f01eSRobert Clausecker	sub		%ecx, %eax
4018803f01eSRobert Clausecker	ret
4028803f01eSRobert Clausecker
	/*
	 * %rdi has already advanced past the 32-byte chunk that failed:
	 * its halves sit at -64(%rdi) and -48(%rdi).  xmm1 was clobbered
	 * by the PAND above, so redo its compare from memory.
	 */
4038803f01eSRobert Clausecker.Lmismatch:
4048803f01eSRobert Clausecker	movdqu		-48(%rsi, %rdi, 1), %xmm1
4058803f01eSRobert Clausecker	pcmpeqb		-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
4068803f01eSRobert Clausecker	pmovmskb	%xmm0, %eax		# mismatches in first 16 bytes
4078803f01eSRobert Clausecker	pmovmskb	%xmm1, %edx		# mismatches in second 16 bytes
4088803f01eSRobert Clausecker	shl		$16, %edx
4098803f01eSRobert Clausecker	or		%edx, %eax		# mismatches in both
4108803f01eSRobert Clausecker	not		%eax			# matches in both
4118803f01eSRobert Clausecker	tzcnt		%eax, %eax		# location of mismatch
4128803f01eSRobert Clausecker	add		%rax, %rdi		# pointer to mismatch
4138803f01eSRobert Clausecker	movzbl		-64(%rdi), %eax		# mismatching bytes
4148803f01eSRobert Clausecker	movzbl		-64(%rdi, %rsi, 1), %ecx
4158803f01eSRobert Clausecker	sub		%ecx, %eax
4168803f01eSRobert Clausecker	ret
4178803f01eSRobert Clausecker#endif
4188803f01eSRobert ClauseckerARCHEND(memcmp, baseline)
41993ab7586SKonstantin Belousov
42093ab7586SKonstantin Belousov	.section .note.GNU-stack,"",%progbits
421