xref: /freebsd-src/lib/libc/amd64/string/timingsafe_bcmp.S (revision 76c2b331bcd9f73c5c8c43a06e328fa0c7b8c39a)
1*76c2b331SRobert Clausecker/*-
2*76c2b331SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation
3*76c2b331SRobert Clausecker *
4*76c2b331SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5*76c2b331SRobert Clausecker * under sponsorship from the FreeBSD Foundation.
6*76c2b331SRobert Clausecker *
7*76c2b331SRobert Clausecker * Redistribution and use in source and binary forms, with or without
8*76c2b331SRobert Clausecker * modification, are permitted provided that the following conditions
9*76c2b331SRobert Clausecker * are met:
10*76c2b331SRobert Clausecker * 1. Redistributions of source code must retain the above copyright
11*76c2b331SRobert Clausecker *    notice, this list of conditions and the following disclaimer.
12*76c2b331SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright
13*76c2b331SRobert Clausecker *    notice, this list of conditions and the following disclaimer in the
14*76c2b331SRobert Clausecker *    documentation and/or other materials provided with the distribution.
15*76c2b331SRobert Clausecker *
16*76c2b331SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17*76c2b331SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18*76c2b331SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19*76c2b331SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20*76c2b331SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21*76c2b331SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22*76c2b331SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23*76c2b331SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24*76c2b331SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25*76c2b331SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26*76c2b331SRobert Clausecker * SUCH DAMAGE
27*76c2b331SRobert Clausecker */
28*76c2b331SRobert Clausecker
29*76c2b331SRobert Clausecker#include <machine/asm.h>
30*76c2b331SRobert Clausecker
31*76c2b331SRobert Clausecker#include "amd64_archlevel.h"
32*76c2b331SRobert Clausecker
33*76c2b331SRobert Clausecker#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
34*76c2b331SRobert Clausecker
/*
 * Implementation table for timingsafe_bcmp.  It names the two variants
 * that are defined further down in this file: "scalar" and "baseline".
 *
 * NOTE(review): ARCHFUNCS, ARCHFUNC and ENDARCHFUNCS are not defined in
 * this file; presumably they come from "amd64_archlevel.h" and are what
 * selects one of the listed variants for callers.  Whether the order of
 * the entries matters cannot be told from here -- confirm against that
 * header before reordering or adding entries.
 */
35*76c2b331SRobert ClauseckerARCHFUNCS(timingsafe_bcmp)
36*76c2b331SRobert Clausecker	ARCHFUNC(timingsafe_bcmp, scalar)
37*76c2b331SRobert Clausecker	ARCHFUNC(timingsafe_bcmp, baseline)
38*76c2b331SRobert ClauseckerENDARCHFUNCS(timingsafe_bcmp)
39*76c2b331SRobert Clausecker
/*
 * timingsafe_bcmp, scalar variant: general-purpose registers only.
 *
 * C prototype, per timingsafe_bcmp(3) (not restated in this file):
 *	int timingsafe_bcmp(const void *b1, const void *b2, size_t len);
 *
 * Registers as used below:
 *	%rdi, %rsi	the two buffers
 *	%rdx		number of bytes to compare
 *	%eax		result: 0 if every compared byte matched, nonzero if not
 *	%rcx, %r8, %r9	scratch
 *
 * Every conditional branch compares against the length in %rdx; buffer
 * contents never steer control flow.  Differences are collected by XORing
 * corresponding words of the two buffers and ORing the results together.
 * Lengths that are not a multiple of the access width are handled with
 * two overlapping accesses, one anchored at the start of the buffers and
 * one anchored at their end.
 */
ARCHENTRY(timingsafe_bcmp, scalar)
	cmpq	$16, %rdx		# more than 16 bytes?
	ja	.Lscalar_gt16

	/* length <= 16 from here on, so the 32-bit view of it suffices */
	cmpl	$8, %edx		# 9--16 bytes?
	ja	.Lscalar_9to16

	cmpl	$4, %edx		# 5--8 bytes?
	ja	.Lscalar_5to8

	cmpl	$2, %edx		# 3--4 bytes?
	ja	.Lscalar_3to4

	testl	%edx, %edx		# 1--2 bytes?
	jnz	.Lscalar_1to2

	xorl	%eax, %eax		# length 0: nothing can differ
	ret

	/* 1--2 bytes: first byte and last byte (the same byte if length is 1) */
.Lscalar_1to2:
	movzbl	(%rdi), %eax		# zero-extending loads keep the
	movzbl	-1(%rdi,%rdx), %ecx	# upper bits of the result clear
	xorb	(%rsi), %al
	xorb	-1(%rsi,%rdx), %cl
	orl	%ecx, %eax		# nonzero iff either byte differs
	ret

	/* 3--4 bytes: first and last 16-bit word, overlapping if length is 3 */
.Lscalar_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi,%rdx), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi,%rdx), %cx
	orl	%ecx, %eax
	ret

	/* 5--8 bytes: first and last 32-bit word */
.Lscalar_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi,%rdx), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi,%rdx), %ecx
	orl	%ecx, %eax
	ret

	/* 9--16 bytes: first and last 64-bit word */
.Lscalar_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi,%rdx), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi,%rdx), %rcx
	orq	%rcx, %rax
	setne	%al			# the caller sees EAX only: force it
	ret				# nonzero if any bit of RAX was set

	/*
	 * More than 16 bytes.  %rax accumulates the difference bits and
	 * %rcx is the end offset of the next 16-byte chunk to fold in.
	 */
.Lscalar_gt16:
	movq	(%rdi), %rax		# fold in bytes 0--15
	movq	8(%rdi), %r9
	movl	$32, %ecx
	xorq	(%rsi), %rax
	xorq	8(%rsi), %r9
	orq	%r9, %rax

	cmpq	%rdx, %rcx		# length <= 32?
	jae	.Lscalar_last16		# then head + tail cover it all

	/* fold in 16 bytes per iteration while the chunk ends before the end */
	ALIGN_TEXT
.Lscalar_loop:
	movq	-16(%rdi,%rcx), %r8
	movq	-8(%rdi,%rcx), %r9
	xorq	-16(%rsi,%rcx), %r8
	xorq	-8(%rsi,%rcx), %r9
	addq	$16, %rcx
	orq	%r9, %r8
	orq	%r8, %rax

	cmpq	%rdx, %rcx
	jb	.Lscalar_loop

	/* final 16 bytes, addressed from the end; may overlap earlier chunks */
.Lscalar_last16:
	movq	-16(%rdi,%rdx), %r8
	movq	-8(%rdi,%rdx), %r9
	xorq	-16(%rsi,%rdx), %r8
	xorq	-8(%rsi,%rdx), %r9
	orq	%r9, %r8
	orq	%r8, %rax
	setne	%al			# make EAX nonzero for any set bit in RAX
	ret
ARCHEND(timingsafe_bcmp, scalar)
122*76c2b331SRobert Clausecker
/*
 * timingsafe_bcmp, baseline variant: same structure as a plain integer
 * implementation for lengths up to 16 bytes, SSE2 instructions (movdqu,
 * pcmpeqb, pand, pmovmskb) for anything longer.
 *
 * C prototype, per timingsafe_bcmp(3) (not restated in this file):
 *	int timingsafe_bcmp(const void *b1, const void *b2, size_t len);
 *
 * Registers as used below:
 *	%rdi, %rsi	the two buffers
 *	%rdx		number of bytes to compare
 *	%eax		result: 0 if every compared byte matched, nonzero if not
 *	%rcx		scratch / chunk end offset
 *	%xmm0-%xmm4	scratch; %xmm4 accumulates the per-lane "equal" mask
 *
 * Every conditional branch compares against the length in %rdx; buffer
 * contents never steer control flow.  Lengths that are not a multiple of
 * the access width are handled with two overlapping accesses, one
 * anchored at the start of the buffers and one anchored at their end.
 */
ARCHENTRY(timingsafe_bcmp, baseline)
	cmpq	$32, %rdx		# more than 32 bytes?
	ja	.Lbaseline_gt32

	/* length <= 32 from here on, so the 32-bit view of it suffices */
	cmpl	$16, %edx		# 17--32 bytes?
	ja	.Lbaseline_17to32

	cmpl	$8, %edx		# 9--16 bytes?
	ja	.Lbaseline_9to16

	cmpl	$4, %edx		# 5--8 bytes?
	ja	.Lbaseline_5to8

	cmpl	$2, %edx		# 3--4 bytes?
	ja	.Lbaseline_3to4

	testl	%edx, %edx		# 1--2 bytes?
	jnz	.Lbaseline_1to2

	xorl	%eax, %eax		# length 0: nothing can differ
	ret

	/* 1--2 bytes: first byte and last byte (the same byte if length is 1) */
.Lbaseline_1to2:
	movzbl	(%rdi), %eax		# zero-extending loads keep the
	movzbl	-1(%rdi,%rdx), %ecx	# upper bits of the result clear
	xorb	(%rsi), %al
	xorb	-1(%rsi,%rdx), %cl
	orl	%ecx, %eax		# nonzero iff either byte differs
	ret

	/* 3--4 bytes: first and last 16-bit word, overlapping if length is 3 */
.Lbaseline_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi,%rdx), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi,%rdx), %cx
	orl	%ecx, %eax
	ret

	/* 5--8 bytes: first and last 32-bit word */
.Lbaseline_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi,%rdx), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi,%rdx), %ecx
	orl	%ecx, %eax
	ret

	/* 9--16 bytes: first and last 64-bit word */
.Lbaseline_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi,%rdx), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi,%rdx), %rcx
	orq	%rcx, %rax
	setne	%al			# the caller sees EAX only: force it
	ret				# nonzero if any bit of RAX was set

	/* 17--32 bytes: first and last 16-byte vector */
.Lbaseline_17to32:
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm2
	movdqu	-16(%rdi,%rdx), %xmm1
	movdqu	-16(%rsi,%rdx), %xmm3
	pcmpeqb	%xmm2, %xmm0		# 0xff in every byte lane that matches
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0		# lane survives only if both windows match
	pmovmskb %xmm0, %eax		# one bit per lane, set where equal
	xorl	$0xffff, %eax		# invert: set where a difference was seen
	ret

	/*
	 * More than 32 bytes.  %xmm4 accumulates the "equal" mask and
	 * %rcx is the end offset of the next 32-byte chunk to fold in.
	 */
.Lbaseline_gt32:
	movdqu	(%rdi), %xmm4		# fold in bytes 0--31
	movdqu	(%rsi), %xmm2
	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm3
	movl	$64, %ecx
	pcmpeqb	%xmm2, %xmm4
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm4
	cmpq	%rdx, %rcx		# length <= 64?
	jae	.Lbaseline_last32	# then head + tail cover it all

	/* fold in 32 bytes per iteration while the chunk ends before the end */
	ALIGN_TEXT
.Lbaseline_loop:
	movdqu	-32(%rdi,%rcx), %xmm0
	movdqu	-32(%rsi,%rcx), %xmm2
	movdqu	-16(%rdi,%rcx), %xmm1
	movdqu	-16(%rsi,%rcx), %xmm3
	addq	$32, %rcx
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm0, %xmm4		# clear lanes that saw a mismatch
	cmpq	%rdx, %rcx
	jb	.Lbaseline_loop

	/* final 32 bytes, addressed from the end; may overlap earlier chunks */
.Lbaseline_last32:
	movdqu	-32(%rdi,%rdx), %xmm0
	movdqu	-32(%rsi,%rdx), %xmm2
	movdqu	-16(%rdi,%rdx), %xmm1
	movdqu	-16(%rsi,%rdx), %xmm3
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm4, %xmm0		# combine with everything seen before
	pmovmskb %xmm0, %eax		# one bit per lane, set where equal
	xorl	$0xffff, %eax		# invert: set where a difference was seen
	ret
ARCHEND(timingsafe_bcmp, baseline)
231*76c2b331SRobert Clausecker
232*76c2b331SRobert Clausecker	.section .note.GNU-stack,"",%progbits
233