#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128-byte shared table].
# Performance results are for the streamed GHASH subroutine on an
# UltraSPARC pre-Tx CPU and are expressed in cycles per processed
# byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why the difference between 32-bit and
# 64-bit compiler-generated code is so big. The compilers *were*
# instructed to generate code for UltraSPARC and should have used
# 64-bit registers for the Z vector (see C code) even in the 32-bit
# build... Oh well, it only means more impressive improvement
# coefficients for this assembler module;-) The loops are aggressively
# modulo-scheduled with respect to references to input data and Z.hi
# updates to achieve the 12-cycle timing. To anchor it to something
# else, sha1-sparcv9.pl spends 11.6 cycles to process one byte on an
# UltraSPARC pre-Tx CPU and ~24 on T1.

$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
.section ".rodata",#alloc

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.section ".text",#alloc,#execinstr
.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
#ifdef __PIC__
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
	rd	%pc, $rem
	or	$tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
	add	$tmp, $rem, $tmp
#endif

	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

#ifdef __PIC__
	set	rem_4bit, $rem_4bit
	ldx	[$rem_4bit+$tmp], $rem_4bit
#else
	set	rem_4bit, $rem_4bit
#endif

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
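! modulo-scheduled inner loop: while Z is still being folded for the
! current pair of nibbles, the next byte of inp[] and Xi[] (offsets 13
! down to 0; bytes 15 and 14 were primed above) is already being
! fetched; rem_4bit reinjects the four bits shifted out of Z.lo into
! Z.hi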
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
#ifdef __PIC__
	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
	rd	%pc, $rem
	or	$tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
	add	$tmp, $rem, $tmp
#endif

	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

#ifdef __PIC__
	set	rem_4bit, $rem_4bit
	ldx	[$rem_4bit+$tmp], $rem_4bit
#else
	set	rem_4bit, $rem_4bit
#endif

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
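! same modulo-scheduled loop as .Lghash_inner above, except that the
! nibbles come from Xi[] alone: a single multiplication, with no
! input block to xor in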
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
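
# ====================================================================
# A minimal plain-Perl sketch of the "4-bit" multiplication the
# assembler above implements, kept only as documentation and never
# called at build time. It assumes a 64-bit perl (bit operations on
# native 64-bit integers) and the Htable layout used above: 16 entries
# of {hi,lo} 64-bit halves indexed by nibble value, passed here as two
# hypothetical array refs. The rem_4bit xor appears one step later in
# the modulo-scheduled assembly, but the math is the same.
sub gcm_gmult_4bit_ref {
	my ($Xi,$Hhi,$Hlo) = @_;	# $Xi: ref to 16 bytes; $Hhi/$Hlo: 16 64-bit halves each
	my @rem_4bit = map { $_<<48 }	# same constants as the rem_4bit table above
	    (0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	     0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
	my $nlo = $Xi->[15];		# last byte of Xi is processed first
	my $nhi = $nlo>>4;  $nlo &= 0xf;
	my ($Zhi,$Zlo) = ($Hhi->[$nlo],$Hlo->[$nlo]);
	my $cnt = 15;
	while (1) {
		my $r = $Zlo & 0xf;		# bits about to fall off Z.lo...
		$Zlo = ($Zhi<<60 | $Zlo>>4);	# 128-bit Z >>= 4
		$Zhi = ($Zhi>>4) ^ $rem_4bit[$r];	# ...come back via rem_4bit
		$Zhi ^= $Hhi->[$nhi];		# fold in Htable entry for high nibble
		$Zlo ^= $Hlo->[$nhi];
		last if --$cnt < 0;
		$nlo = $Xi->[$cnt];
		$nhi = $nlo>>4;  $nlo &= 0xf;
		$r = $Zlo & 0xf;
		$Zlo = ($Zhi<<60 | $Zlo>>4);
		$Zhi = ($Zhi>>4) ^ $rem_4bit[$r];
		$Zhi ^= $Hhi->[$nlo];		# fold in entry for low nibble
		$Zlo ^= $Hlo->[$nlo];
	}
	return ($Zhi,$Zlo);	# caller stores Zhi||Zlo back into Xi, big-endian
}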