#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on a single-issue core should execute in
# <40 cycles. Having verified that gcc 3.4 didn't unroll the
# corresponding loop, the assembler loop body was found to be ~3x
# smaller than the compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# ====================================================================
# Note about "528B" variant. In the ARM case it makes less sense to
# implement it, for the following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   the "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# The caller is expected to maintain a specific *dword* order in
# Htable, namely with the *least* significant dword of the 128-bit
# value at the *lower* address. This differs completely from the C
# code and has everything to do with the ldm instruction and the order
# in which dwords are "consumed" by the algorithm. *Byte* order within
# these dwords in turn is whatever the *native* byte order is on the
# current platform. See gcm128.c for a working example...
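#
# As a host-side illustration of that layout (a sketch only, with a
# hypothetical helper name, assuming a little-endian platform and a
# 64-bit perl; nothing below is emitted into the module), an Htable
# entry given as 64-bit halves $hi:$lo would be laid out as:
#
#	sub htable_entry {			# hypothetical
#	    my ($hi, $lo) = @_;
#	    return pack("V4",			# four 32-bit words,
#		$lo & 0xffffffff, $lo >> 32,	# ascending significance,
#		$hi & 0xffffffff, $hi >> 32);	# least significant first
#	}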

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;

sub Zsmash() {
  my $i=12;
  my @args=@_;
  for ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$_,$_
	str	$_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$_,[$Xi,#$i]
#else
	mov	$Tlh,$_,lsr#8
	strb	$_,[$Xi,#$i+3]
	mov	$Thl,$_,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$_,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}
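
# The rem_4bit table emitted below holds the reduction constants for
# the nibble shifted out of Z: bit b of the 4-bit remainder
# contributes 0xE100>>(3-b), the bit-reversed polynomial 0xE1
# pre-positioned for the 16-bit-wide xor into the top of Zhh. A
# throw-away sketch (not run at build time) that reproduces the
# sixteen .short constants:
#
#	for my $i (0..15) {
#	    my $v = 0;
#	    for my $b (0..3) { $v ^= 0xE100 >> (3-$b) if $i & 1<<$b; }
#	    printf "0x%04X%s", $v, $i%4==3 ? "\n" : ",";
#	}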

$code=<<___;
#include "arm_arch.h"

.text
.syntax	unified
.code	32

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	$rem_4bit,pc,#8
	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrbpl	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrbpl	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
	&Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrbpl	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
	&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my $cnt=$Htbl;	# $Htbl is used once in the very beginning

my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));

# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
# in Zo. Or should I say "top bit", because GHASH is specified in
# reverse bit order? Otherwise straightforward 128-bit H by one input
# byte multiplication and modulo-reduction, times 16.

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
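
# For instance (host-side helpers only, nothing here is emitted):
# &Dlo("q8") evaluates to "d16" and &Dhi("q8") to "d17", splitting a
# 128-bit quad register into the two 64-bit halves that NEON can
# address individually, while &Q("d16") maps a low half back to "q8".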

$code.=<<___;
#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
.fpu	neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub		$Htbl,#16		@ point at H in GCM128_CTX
	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!	@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
	vshr.u64	$mod,#32
	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
	veor		$zero,$zero
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$Qpost,$Qpost
	veor		$R,$R
	mov		$cnt,#16
	veor		$Z,$Z
	mov		$len,#16
	veor		$Zo,$Zo
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
	b		.Linner_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
	vshr.u64	$mod,#32
	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
	veor		$zero,$zero
	nop
#ifdef __ARMEL__
	vrev64.8	$Z,$Z
#endif
.Louter_neon:
	vld1.64		`&Dhi("$IN")`,[$inp]!	@ load inp
	veor		$Qpost,$Qpost
	vld1.64		`&Dlo("$IN")`,[$inp]!
	veor		$R,$R
	mov		$cnt,#16
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$Zo,$Zo
	veor		$IN,$Z			@ inp^=Xi
	veor		$Z,$Z
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
.Linner_neon:
	subs		$cnt,$cnt,#1
	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i]
	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i]
	vext.8		$IN,$zero,#1		@ IN>>=8

	veor		$Z,$Qpost		@ modulo-scheduled part
	vshl.i64	`&Dlo("$R")`,#48
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`

	veor		`&Dhi("$Z")`,`&Dlo("$R")`
	vuzp.8		$Qlo,$Qhi
	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
	vext.8		$Z,$zero,#1		@ Z>>=8

	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1
	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
	veor		$Z,$Qhi
	bne		.Linner_neon

	veor		$Z,$Qpost		@ modulo-scheduled artefact
	vshl.i64	`&Dlo("$R")`,#48
	veor		`&Dhi("$Z")`,`&Dlo("$R")`

	@ finalization, normalize Z:Zo
	vand		$Zo,$mod		@ suffices to mask the bit
	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
	vshl.i64	$Z,#1
	subs		$len,#16
	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
	bne		.Louter_neon

#ifdef __ARMEL__
	vrev64.8	$Z,$Z
#endif
	sub		$Xi,#16
	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
	vst1.64		`&Dlo("$Z")`,[$Xi,:64]

	bx	lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;	# enforce flush
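
# Note: 0xe12fff1e, substituted for "bx lr" above, is the fixed 32-bit
# encoding of that very instruction (available since ARMv4T), so the
# generated module still returns via BX on cores that have it while
# remaining assemblable with -march=armv4.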