/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define	ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/*
 * int timingsafe_bcmp(const void *b1, const void *b2, size_t len)
 *
 * Syntax: GNU as, AT&T operand order.  ABI: System V AMD64.
 *   In:  %rdi = b1, %rsi = b2, %rdx = len
 *   Out: %eax = 0 if the first len bytes of both buffers are equal,
 *        nonzero otherwise (the exact nonzero value differs between
 *        code paths and carries no meaning).
 *
 * Both implementations below are leaf routines that do not touch the
 * stack.  Every conditional branch in them is taken on a comparison that
 * involves only len (%rdx/%edx) or the loop offset (%rcx), never on buffer
 * contents.  Differences are accumulated with xor/or (scalar) or
 * pcmpeqb/pand (baseline) and turned into the result without branching.
 *
 * Short inputs are handled by loading a head window and a tail window of
 * the same width; the two windows overlap whenever len is smaller than
 * twice the window width, so no byte outside [0, len) is ever read.
 *
 * NOTE(review): the ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS and ARCHENTRY/ARCHEND
 * macros come from amd64_archlevel.h, which is not visible here; presumably
 * they register the listed variants for selection by CPU feature level --
 * confirm any ordering requirements against that header.
 */
ARCHFUNCS(timingsafe_bcmp)
	ARCHFUNC(timingsafe_bcmp, scalar)
	ARCHFUNC(timingsafe_bcmp, baseline)
ENDARCHFUNCS(timingsafe_bcmp)

/*
 * Scalar variant: general purpose registers only.
 * Clobbers: %rax, %rcx, %r8, %r9, flags.
 */
ARCHENTRY(timingsafe_bcmp, scalar)
	/*
	 * Dispatch on the length only.  Once len <= 16 is established the
	 * 32 bit compares on %edx are sufficient.
	 */
	cmp	$16, %rdx		# at least 17 bytes to process?
	ja	.Lgt16

	cmp	$8, %edx		# at least 9 bytes to process?
	ja	.L0916

	cmp	$4, %edx		# at least 5 bytes to process?
	ja	.L0508

	cmp	$2, %edx		# at least 3 bytes to process?
	ja	.L0304

	test	%edx, %edx		# buffer empty?
	jnz	.L0102

	xor	%eax, %eax		# empty buffer always matches
	ret

	/* len = 1 or 2: first byte and last byte (same byte if len = 1) */
.L0102:	movzbl	(%rdi), %eax		# load 1--2 bytes from first buffer
	movzbl	-1(%rdi, %rdx, 1), %ecx
	xor	(%rsi), %al		# xor in second buffer
	xor	-1(%rsi, %rdx, 1), %cl
	or	%ecx, %eax		# mismatch in any of the two?
	ret

	/* len = 3 or 4: first and last 16 bit word, possibly overlapping */
.L0304:	movzwl	(%rdi), %eax
	movzwl	-2(%rdi, %rdx, 1), %ecx
	xor	(%rsi), %ax
	xor	-2(%rsi, %rdx, 1), %cx
	or	%ecx, %eax
	ret

	/* len = 5 to 8: first and last 32 bit word, possibly overlapping */
.L0508:	mov	(%rdi), %eax
	mov	-4(%rdi, %rdx, 1), %ecx
	xor	(%rsi), %eax
	xor	-4(%rsi, %rdx, 1), %ecx
	or	%ecx, %eax
	ret

	/* len = 9 to 16: first and last 64 bit word, possibly overlapping */
.L0916:	mov	(%rdi), %rax
	mov	-8(%rdi, %rdx, 1), %rcx
	xor	(%rsi), %rax
	xor	-8(%rsi, %rdx, 1), %rcx
	or	%rcx, %rax
	setnz	%al			# ensure EAX nonzero even if only
	ret				# high bits of RAX were set

	/*
	 * more than 16 bytes: process buffer in a loop
	 *
	 * %rax accumulates the or of all xor differences seen so far.
	 * %rcx is the end offset of the next 16 byte chunk; it starts at 32
	 * because bytes 0--15 are consumed before the loop is entered.
	 */
.Lgt16:	mov	(%rdi), %rax		# process first 16 bytes
	mov	8(%rdi), %r9
	mov	$32, %ecx
	xor	(%rsi), %rax
	xor	8(%rsi), %r9
	or	%r9, %rax

	cmp	%rdx, %rcx		# enough left for a full iteration?
	jae	.Ltail

	/* main loop processing 16 bytes per iteration */
	ALIGN_TEXT
0:	mov	-16(%rdi, %rcx, 1), %r8
	mov	-8(%rdi, %rcx, 1), %r9
	xor	-16(%rsi, %rcx, 1), %r8
	xor	-8(%rsi, %rcx, 1), %r9
	add	$16, %rcx
	or	%r9, %r8
	or	%r8, %rax

	cmp	%rdx, %rcx		# loop while chunk end is below len
	jb	0b

	/*
	 * process last 16 bytes
	 *
	 * The window is anchored at the end of the buffer and may overlap
	 * bytes that were already checked; this only ors in the same
	 * differences again.
	 */
.Ltail:	mov	-16(%rdi, %rdx, 1), %r8
	mov	-8(%rdi, %rdx, 1), %r9
	xor	-16(%rsi, %rdx, 1), %r8
	xor	-8(%rsi, %rdx, 1), %r9
	or	%r9, %r8
	or	%r8, %rax		# sets ZF iff no difference was found
	setnz	%al			# make EAX nonzero if any bit of RAX is
	ret
ARCHEND(timingsafe_bcmp, scalar)

/*
 * Baseline variant: 16 byte vector compares (movdqu/pcmpeqb/pand/pmovmskb)
 * for more than 16 bytes, same scalar code as above for shorter inputs.
 * Clobbers: %rax, %rcx, %xmm0--%xmm4, flags.
 */
ARCHENTRY(timingsafe_bcmp, baseline)
	/* dispatch on the length only, as in the scalar variant */
	cmp	$32, %rdx		# at least 33 bytes to process?
	ja	.Lgt32b

	cmp	$16, %edx		# at least 17 bytes to process?
	ja	.L1732b

	cmp	$8, %edx		# at least 9 bytes to process?
	ja	.L0916b

	cmp	$4, %edx		# at least 5 bytes to process?
	ja	.L0508b

	cmp	$2, %edx		# at least 3 bytes to process?
	ja	.L0304b

	test	%edx, %edx		# buffer empty?
	jnz	.L0102b

	xor	%eax, %eax		# empty buffer always matches
	ret

	/* len = 1 or 2: first byte and last byte (same byte if len = 1) */
.L0102b:
	movzbl	(%rdi), %eax		# load 1--2 bytes from first buffer
	movzbl	-1(%rdi, %rdx, 1), %ecx
	xor	(%rsi), %al		# xor in second buffer
	xor	-1(%rsi, %rdx, 1), %cl
	or	%ecx, %eax		# mismatch in any of the two?
	ret

	/* len = 3 or 4: first and last 16 bit word, possibly overlapping */
.L0304b:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi, %rdx, 1), %ecx
	xor	(%rsi), %ax
	xor	-2(%rsi, %rdx, 1), %cx
	or	%ecx, %eax
	ret

	/* len = 5 to 8: first and last 32 bit word, possibly overlapping */
.L0508b:
	mov	(%rdi), %eax
	mov	-4(%rdi, %rdx, 1), %ecx
	xor	(%rsi), %eax
	xor	-4(%rsi, %rdx, 1), %ecx
	or	%ecx, %eax
	ret

	/* len = 9 to 16: first and last 64 bit word, possibly overlapping */
.L0916b:
	mov	(%rdi), %rax
	mov	-8(%rdi, %rdx, 1), %rcx
	xor	(%rsi), %rax
	xor	-8(%rsi, %rdx, 1), %rcx
	or	%rcx, %rax
	setnz	%al			# ensure EAX nonzero even if only
	ret				# high bits of RAX were set

	/*
	 * len = 17 to 32: first and last 16 bytes, possibly overlapping.
	 * pcmpeqb yields 0xff in every byte lane that is equal, so the
	 * and of both results is all ones iff everything matched.
	 */
.L1732b:
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm2
	movdqu	-16(%rdi, %rdx, 1), %xmm1
	movdqu	-16(%rsi, %rdx, 1), %xmm3
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pmovmskb %xmm0, %eax		# 1 where equal
	xor	$0xffff, %eax		# 1 where not equal
	ret

	/*
	 * more than 32 bytes: process buffer in a loop
	 *
	 * %xmm4 accumulates the and of all equality masks seen so far.
	 * %rcx is the end offset of the next 32 byte chunk; it starts at 64
	 * because bytes 0--31 are consumed before the loop is entered.
	 */
.Lgt32b:
	movdqu	(%rdi), %xmm4
	movdqu	(%rsi), %xmm2
	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm3
	mov	$64, %ecx
	pcmpeqb	%xmm2, %xmm4
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm4
	cmp	%rdx, %rcx		# enough left for a full iteration?
	jae	.Ltailb

	/* main loop processing 32 bytes per iteration */
	ALIGN_TEXT
0:	movdqu	-32(%rdi, %rcx, 1), %xmm0
	movdqu	-32(%rsi, %rcx, 1), %xmm2
	movdqu	-16(%rdi, %rcx, 1), %xmm1
	movdqu	-16(%rsi, %rcx, 1), %xmm3
	add	$32, %rcx
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm0, %xmm4
	cmp	%rdx, %rcx		# loop while chunk end is below len
	jb	0b

	/*
	 * process last 32 bytes
	 *
	 * The window is anchored at the end of the buffer and may overlap
	 * bytes that were already checked; this only ands in the same
	 * equality information again.
	 */
.Ltailb:
	movdqu	-32(%rdi, %rdx, 1), %xmm0
	movdqu	-32(%rsi, %rdx, 1), %xmm2
	movdqu	-16(%rdi, %rdx, 1), %xmm1
	movdqu	-16(%rsi, %rdx, 1), %xmm3
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm4, %xmm0		# fold in the accumulated mask
	pmovmskb %xmm0, %eax		# 1 where equal
	xor	$0xffff, %eax		# 1 where not equal
	ret
ARCHEND(timingsafe_bcmp, baseline)

	.section .note.GNU-stack,"",%progbits