#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
62 63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 64 65$flavour = shift; 66$output = shift; 67 68open STDOUT,">$output"; 69 70if ($flavour =~ /64/) { 71 $LEVEL ="2.0W"; 72 $SIZE_T =8; 73 $FRAME_MARKER =80; 74 $SAVED_RP =16; 75 $PUSH ="std"; 76 $PUSHMA ="std,ma"; 77 $POP ="ldd"; 78 $POPMB ="ldd,mb"; 79 $BN_SZ =$SIZE_T; 80} else { 81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; 82 $SIZE_T =4; 83 $FRAME_MARKER =48; 84 $SAVED_RP =20; 85 $PUSH ="stw"; 86 $PUSHMA ="stwm"; 87 $POP ="ldw"; 88 $POPMB ="ldwm"; 89 $BN_SZ =$SIZE_T; 90} 91 92$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker 93 # [+ argument transfer] 94$LOCALS=$FRAME-$FRAME_MARKER; 95$FRAME+=32; # local variables 96 97$tp="%r31"; 98$ti1="%r29"; 99$ti0="%r28"; 100 101$rp="%r26"; 102$ap="%r25"; 103$bp="%r24"; 104$np="%r23"; 105$n0="%r22"; # passed through stack in 32-bit 106$num="%r21"; # passed through stack in 32-bit 107$idx="%r20"; 108$arrsz="%r19"; 109 110$nm1="%r7"; 111$nm0="%r6"; 112$ab1="%r5"; 113$ab0="%r4"; 114 115$fp="%r3"; 116$hi1="%r2"; 117$hi0="%r1"; 118 119$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s 120 121$fm0="%fr4"; $fti=$fm0; 122$fbi="%fr5L"; 123$fn0="%fr5R"; 124$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; 125$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; 126 127$code=<<___; 128 .LEVEL $LEVEL 129 .text 130 131 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 132 .ALIGN 64 133bn_mul_mont 134 .PROC 135 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 136 .ENTRY 137 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 138 $PUSHMA %r3,$FRAME(%sp) 139 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) 140 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) 141 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) 142 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) 143 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) 144 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) 145 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) 146 ldo -$FRAME(%sp),$fp 147___ 148$code.=<<___ if ($SIZE_T==4); 149 ldw `-$FRAME_MARKER-4`($fp),$n0 150 ldw `-$FRAME_MARKER-8`($fp),$num 151 nop 152 nop 
; alignment 153___ 154$code.=<<___ if ($BN_SZ==4); 155 comiclr,<= 6,$num,%r0 ; are vectors long enough? 156 b L\$abort 157 ldi 0,%r28 ; signal "unhandled" 158 add,ev %r0,$num,$num ; is $num even? 159 b L\$abort 160 nop 161 or $ap,$np,$ti1 162 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 163 b L\$abort 164 nop 165 nop ; alignment 166 nop 167 168 fldws 0($n0),${fn0} 169 fldws,ma 4($bp),${fbi} ; bp[0] 170___ 171$code.=<<___ if ($BN_SZ==8); 172 comib,> 3,$num,L\$abort ; are vectors long enough? 173 ldi 0,%r28 ; signal "unhandled" 174 addl $num,$num,$num ; I operate on 32-bit values 175 176 fldws 4($n0),${fn0} ; only low part of n0 177 fldws 4($bp),${fbi} ; bp[0] in flipped word order 178___ 179$code.=<<___; 180 fldds 0($ap),${fai} ; ap[0,1] 181 fldds 0($np),${fni} ; np[0,1] 182 183 sh2addl $num,%r0,$arrsz 184 ldi 31,$hi0 185 ldo 36($arrsz),$hi1 ; space for tp[num+1] 186 andcm $hi1,$hi0,$hi1 ; align 187 addl $hi1,%sp,%sp 188 $PUSH $fp,-$SIZE_T(%sp) 189 190 ldo `$LOCALS+16`($fp),$xfer 191 ldo `$LOCALS+32+4`($fp),$tp 192 193 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 194 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 195 xmpyu ${fn0},${fab0}R,${fm0} 196 197 addl $arrsz,$ap,$ap ; point at the end 198 addl $arrsz,$np,$np 199 subi 0,$arrsz,$idx ; j=0 200 ldo 8($idx),$idx ; j++++ 201 202 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 203 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 204 fstds ${fab0},-16($xfer) 205 fstds ${fnm0},-8($xfer) 206 fstds ${fab1},0($xfer) 207 fstds ${fnm1},8($xfer) 208 flddx $idx($ap),${fai} ; ap[2,3] 209 flddx $idx($np),${fni} ; np[2,3] 210___ 211$code.=<<___ if ($BN_SZ==4); 212#ifdef __LP64__ 213 mtctl $hi0,%cr11 ; $hi0 still holds 31 214 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 215 b L\$parisc11 216 nop 217___ 218$code.=<<___; # PA-RISC 2.0 code-path 219 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 220 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 221 ldd -16($xfer),$ab0 222 fstds ${fab0},-16($xfer) 223 224 extrd,u $ab0,31,32,$hi0 225 extrd,u 
$ab0,63,32,$ab0 226 ldd -8($xfer),$nm0 227 fstds ${fnm0},-8($xfer) 228 ldo 8($idx),$idx ; j++++ 229 addl $ab0,$nm0,$nm0 ; low part is discarded 230 extrd,u $nm0,31,32,$hi1 231 232L\$1st 233 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 234 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 235 ldd 0($xfer),$ab1 236 fstds ${fab1},0($xfer) 237 addl $hi0,$ab1,$ab1 238 extrd,u $ab1,31,32,$hi0 239 ldd 8($xfer),$nm1 240 fstds ${fnm1},8($xfer) 241 extrd,u $ab1,63,32,$ab1 242 addl $hi1,$nm1,$nm1 243 flddx $idx($ap),${fai} ; ap[j,j+1] 244 flddx $idx($np),${fni} ; np[j,j+1] 245 addl $ab1,$nm1,$nm1 246 extrd,u $nm1,31,32,$hi1 247 248 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 249 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 250 ldd -16($xfer),$ab0 251 fstds ${fab0},-16($xfer) 252 addl $hi0,$ab0,$ab0 253 extrd,u $ab0,31,32,$hi0 254 ldd -8($xfer),$nm0 255 fstds ${fnm0},-8($xfer) 256 extrd,u $ab0,63,32,$ab0 257 addl $hi1,$nm0,$nm0 258 stw $nm1,-4($tp) ; tp[j-1] 259 addl $ab0,$nm0,$nm0 260 stw,ma $nm0,8($tp) ; tp[j-1] 261 addib,<> 8,$idx,L\$1st ; j++++ 262 extrd,u $nm0,31,32,$hi1 263 264 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 265 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 266 ldd 0($xfer),$ab1 267 fstds ${fab1},0($xfer) 268 addl $hi0,$ab1,$ab1 269 extrd,u $ab1,31,32,$hi0 270 ldd 8($xfer),$nm1 271 fstds ${fnm1},8($xfer) 272 extrd,u $ab1,63,32,$ab1 273 addl $hi1,$nm1,$nm1 274 ldd -16($xfer),$ab0 275 addl $ab1,$nm1,$nm1 276 ldd -8($xfer),$nm0 277 extrd,u $nm1,31,32,$hi1 278 279 addl $hi0,$ab0,$ab0 280 extrd,u $ab0,31,32,$hi0 281 stw $nm1,-4($tp) ; tp[j-1] 282 extrd,u $ab0,63,32,$ab0 283 addl $hi1,$nm0,$nm0 284 ldd 0($xfer),$ab1 285 addl $ab0,$nm0,$nm0 286 ldd,mb 8($xfer),$nm1 287 extrd,u $nm0,31,32,$hi1 288 stw,ma $nm0,8($tp) ; tp[j-1] 289 290 ldo -1($num),$num ; i-- 291 subi 0,$arrsz,$idx ; j=0 292___ 293$code.=<<___ if ($BN_SZ==4); 294 fldws,ma 4($bp),${fbi} ; bp[1] 295___ 296$code.=<<___ if ($BN_SZ==8); 297 fldws 0($bp),${fbi} ; bp[1] in flipped word order 298___ 299$code.=<<___; 300 flddx 
$idx($ap),${fai} ; ap[0,1] 301 flddx $idx($np),${fni} ; np[0,1] 302 fldws 8($xfer),${fti}R ; tp[0] 303 addl $hi0,$ab1,$ab1 304 extrd,u $ab1,31,32,$hi0 305 extrd,u $ab1,63,32,$ab1 306 ldo 8($idx),$idx ; j++++ 307 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 308 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 309 addl $hi1,$nm1,$nm1 310 addl $ab1,$nm1,$nm1 311 extrd,u $nm1,31,32,$hi1 312 fstws,mb ${fab0}L,-8($xfer) ; save high part 313 stw $nm1,-4($tp) ; tp[j-1] 314 315 fcpy,sgl %fr0,${fti}L ; zero high part 316 fcpy,sgl %fr0,${fab0}L 317 addl $hi1,$hi0,$hi0 318 extrd,u $hi0,31,32,$hi1 319 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 320 fcnvxf,dbl,dbl ${fab0},${fab0} 321 stw $hi0,0($tp) 322 stw $hi1,4($tp) 323 324 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 325 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 326 xmpyu ${fn0},${fab0}R,${fm0} 327 ldo `$LOCALS+32+4`($fp),$tp 328L\$outer 329 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 330 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 331 fstds ${fab0},-16($xfer) ; 33-bit value 332 fstds ${fnm0},-8($xfer) 333 flddx $idx($ap),${fai} ; ap[2] 334 flddx $idx($np),${fni} ; np[2] 335 ldo 8($idx),$idx ; j++++ 336 ldd -16($xfer),$ab0 ; 33-bit value 337 ldd -8($xfer),$nm0 338 ldw 0($xfer),$hi0 ; high part 339 340 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 341 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 342 extrd,u $ab0,31,32,$ti0 ; carry bit 343 extrd,u $ab0,63,32,$ab0 344 fstds ${fab1},0($xfer) 345 addl $ti0,$hi0,$hi0 ; account carry bit 346 fstds ${fnm1},8($xfer) 347 addl $ab0,$nm0,$nm0 ; low part is discarded 348 ldw 0($tp),$ti1 ; tp[1] 349 extrd,u $nm0,31,32,$hi1 350 fstds ${fab0},-16($xfer) 351 fstds ${fnm0},-8($xfer) 352 353L\$inner 354 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 355 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 356 ldd 0($xfer),$ab1 357 fstds ${fab1},0($xfer) 358 addl $hi0,$ti1,$ti1 359 addl $ti1,$ab1,$ab1 360 ldd 8($xfer),$nm1 361 fstds ${fnm1},8($xfer) 362 extrd,u $ab1,31,32,$hi0 363 extrd,u 
$ab1,63,32,$ab1 364 flddx $idx($ap),${fai} ; ap[j,j+1] 365 flddx $idx($np),${fni} ; np[j,j+1] 366 addl $hi1,$nm1,$nm1 367 addl $ab1,$nm1,$nm1 368 ldw 4($tp),$ti0 ; tp[j] 369 stw $nm1,-4($tp) ; tp[j-1] 370 371 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 372 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 373 ldd -16($xfer),$ab0 374 fstds ${fab0},-16($xfer) 375 addl $hi0,$ti0,$ti0 376 addl $ti0,$ab0,$ab0 377 ldd -8($xfer),$nm0 378 fstds ${fnm0},-8($xfer) 379 extrd,u $ab0,31,32,$hi0 380 extrd,u $nm1,31,32,$hi1 381 ldw 8($tp),$ti1 ; tp[j] 382 extrd,u $ab0,63,32,$ab0 383 addl $hi1,$nm0,$nm0 384 addl $ab0,$nm0,$nm0 385 stw,ma $nm0,8($tp) ; tp[j-1] 386 addib,<> 8,$idx,L\$inner ; j++++ 387 extrd,u $nm0,31,32,$hi1 388 389 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 390 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 391 ldd 0($xfer),$ab1 392 fstds ${fab1},0($xfer) 393 addl $hi0,$ti1,$ti1 394 addl $ti1,$ab1,$ab1 395 ldd 8($xfer),$nm1 396 fstds ${fnm1},8($xfer) 397 extrd,u $ab1,31,32,$hi0 398 extrd,u $ab1,63,32,$ab1 399 ldw 4($tp),$ti0 ; tp[j] 400 addl $hi1,$nm1,$nm1 401 addl $ab1,$nm1,$nm1 402 ldd -16($xfer),$ab0 403 ldd -8($xfer),$nm0 404 extrd,u $nm1,31,32,$hi1 405 406 addl $hi0,$ab0,$ab0 407 addl $ti0,$ab0,$ab0 408 stw $nm1,-4($tp) ; tp[j-1] 409 extrd,u $ab0,31,32,$hi0 410 ldw 8($tp),$ti1 ; tp[j] 411 extrd,u $ab0,63,32,$ab0 412 addl $hi1,$nm0,$nm0 413 ldd 0($xfer),$ab1 414 addl $ab0,$nm0,$nm0 415 ldd,mb 8($xfer),$nm1 416 extrd,u $nm0,31,32,$hi1 417 stw,ma $nm0,8($tp) ; tp[j-1] 418 419 addib,= -1,$num,L\$outerdone ; i-- 420 subi 0,$arrsz,$idx ; j=0 421___ 422$code.=<<___ if ($BN_SZ==4); 423 fldws,ma 4($bp),${fbi} ; bp[i] 424___ 425$code.=<<___ if ($BN_SZ==8); 426 ldi 12,$ti0 ; bp[i] in flipped word order 427 addl,ev %r0,$num,$num 428 ldi -4,$ti0 429 addl $ti0,$bp,$bp 430 fldws 0($bp),${fbi} 431___ 432$code.=<<___; 433 flddx $idx($ap),${fai} ; ap[0] 434 addl $hi0,$ab1,$ab1 435 flddx $idx($np),${fni} ; np[0] 436 fldws 8($xfer),${fti}R ; tp[0] 437 addl $ti1,$ab1,$ab1 438 extrd,u 
$ab1,31,32,$hi0 439 extrd,u $ab1,63,32,$ab1 440 441 ldo 8($idx),$idx ; j++++ 442 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 443 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 444 ldw 4($tp),$ti0 ; tp[j] 445 446 addl $hi1,$nm1,$nm1 447 fstws,mb ${fab0}L,-8($xfer) ; save high part 448 addl $ab1,$nm1,$nm1 449 extrd,u $nm1,31,32,$hi1 450 fcpy,sgl %fr0,${fti}L ; zero high part 451 fcpy,sgl %fr0,${fab0}L 452 stw $nm1,-4($tp) ; tp[j-1] 453 454 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 455 fcnvxf,dbl,dbl ${fab0},${fab0} 456 addl $hi1,$hi0,$hi0 457 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 458 addl $ti0,$hi0,$hi0 459 extrd,u $hi0,31,32,$hi1 460 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 461 stw $hi0,0($tp) 462 stw $hi1,4($tp) 463 xmpyu ${fn0},${fab0}R,${fm0} 464 465 b L\$outer 466 ldo `$LOCALS+32+4`($fp),$tp 467 468L\$outerdone 469 addl $hi0,$ab1,$ab1 470 addl $ti1,$ab1,$ab1 471 extrd,u $ab1,31,32,$hi0 472 extrd,u $ab1,63,32,$ab1 473 474 ldw 4($tp),$ti0 ; tp[j] 475 476 addl $hi1,$nm1,$nm1 477 addl $ab1,$nm1,$nm1 478 extrd,u $nm1,31,32,$hi1 479 stw $nm1,-4($tp) ; tp[j-1] 480 481 addl $hi1,$hi0,$hi0 482 addl $ti0,$hi0,$hi0 483 extrd,u $hi0,31,32,$hi1 484 stw $hi0,0($tp) 485 stw $hi1,4($tp) 486 487 ldo `$LOCALS+32`($fp),$tp 488 sub %r0,%r0,%r0 ; clear borrow 489___ 490$code.=<<___ if ($BN_SZ==4); 491 ldws,ma 4($tp),$ti0 492 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 
493 b L\$sub_pa11 494 addl $tp,$arrsz,$tp 495L\$sub 496 ldwx $idx($np),$hi0 497 subb $ti0,$hi0,$hi1 498 ldwx $idx($tp),$ti0 499 addib,<> 4,$idx,L\$sub 500 stws,ma $hi1,4($rp) 501 502 subb $ti0,%r0,$hi1 503 ldo -4($tp),$tp 504___ 505$code.=<<___ if ($BN_SZ==8); 506 ldd,ma 8($tp),$ti0 507L\$sub 508 ldd $idx($np),$hi0 509 shrpd $ti0,$ti0,32,$ti0 ; flip word order 510 std $ti0,-8($tp) ; save flipped value 511 sub,db $ti0,$hi0,$hi1 512 ldd,ma 8($tp),$ti0 513 addib,<> 8,$idx,L\$sub 514 std,ma $hi1,8($rp) 515 516 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 517 sub,db $ti0,%r0,$hi1 518 ldo -8($tp),$tp 519___ 520$code.=<<___; 521 and $tp,$hi1,$ap 522 andcm $rp,$hi1,$bp 523 or $ap,$bp,$np 524 525 sub $rp,$arrsz,$rp ; rewind rp 526 subi 0,$arrsz,$idx 527 ldo `$LOCALS+32`($fp),$tp 528L\$copy 529 ldd $idx($np),$hi0 530 std,ma %r0,8($tp) 531 addib,<> 8,$idx,.-8 ; L\$copy 532 std,ma $hi0,8($rp) 533___ 534 535if ($BN_SZ==4) { # PA-RISC 1.1 code-path 536$ablo=$ab0; 537$abhi=$ab1; 538$nmlo0=$nm0; 539$nmhi0=$nm1; 540$nmlo1="%r9"; 541$nmhi1="%r8"; 542 543$code.=<<___; 544 b L\$done 545 nop 546 547 .ALIGN 8 548L\$parisc11 549#endif 550 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 551 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 552 ldw -12($xfer),$ablo 553 ldw -16($xfer),$hi0 554 ldw -4($xfer),$nmlo0 555 ldw -8($xfer),$nmhi0 556 fstds ${fab0},-16($xfer) 557 fstds ${fnm0},-8($xfer) 558 559 ldo 8($idx),$idx ; j++++ 560 add $ablo,$nmlo0,$nmlo0 ; discarded 561 addc %r0,$nmhi0,$hi1 562 ldw 4($xfer),$ablo 563 ldw 0($xfer),$abhi 564 nop 565 566L\$1st_pa11 567 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 568 flddx $idx($ap),${fai} ; ap[j,j+1] 569 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 570 flddx $idx($np),${fni} ; np[j,j+1] 571 add $hi0,$ablo,$ablo 572 ldw 12($xfer),$nmlo1 573 addc %r0,$abhi,$hi0 574 ldw 8($xfer),$nmhi1 575 add $ablo,$nmlo1,$nmlo1 576 fstds ${fab1},0($xfer) 577 addc %r0,$nmhi1,$nmhi1 578 fstds ${fnm1},8($xfer) 579 add $hi1,$nmlo1,$nmlo1 580 ldw -12($xfer),$ablo 581 addc 
%r0,$nmhi1,$hi1 582 ldw -16($xfer),$abhi 583 584 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 585 ldw -4($xfer),$nmlo0 586 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 587 ldw -8($xfer),$nmhi0 588 add $hi0,$ablo,$ablo 589 stw $nmlo1,-4($tp) ; tp[j-1] 590 addc %r0,$abhi,$hi0 591 fstds ${fab0},-16($xfer) 592 add $ablo,$nmlo0,$nmlo0 593 fstds ${fnm0},-8($xfer) 594 addc %r0,$nmhi0,$nmhi0 595 ldw 0($xfer),$abhi 596 add $hi1,$nmlo0,$nmlo0 597 ldw 4($xfer),$ablo 598 stws,ma $nmlo0,8($tp) ; tp[j-1] 599 addib,<> 8,$idx,L\$1st_pa11 ; j++++ 600 addc %r0,$nmhi0,$hi1 601 602 ldw 8($xfer),$nmhi1 603 ldw 12($xfer),$nmlo1 604 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 605 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 606 add $hi0,$ablo,$ablo 607 fstds ${fab1},0($xfer) 608 addc %r0,$abhi,$hi0 609 fstds ${fnm1},8($xfer) 610 add $ablo,$nmlo1,$nmlo1 611 ldw -16($xfer),$abhi 612 addc %r0,$nmhi1,$nmhi1 613 ldw -12($xfer),$ablo 614 add $hi1,$nmlo1,$nmlo1 615 ldw -8($xfer),$nmhi0 616 addc %r0,$nmhi1,$hi1 617 ldw -4($xfer),$nmlo0 618 619 add $hi0,$ablo,$ablo 620 stw $nmlo1,-4($tp) ; tp[j-1] 621 addc %r0,$abhi,$hi0 622 ldw 0($xfer),$abhi 623 add $ablo,$nmlo0,$nmlo0 624 ldw 4($xfer),$ablo 625 addc %r0,$nmhi0,$nmhi0 626 ldws,mb 8($xfer),$nmhi1 627 add $hi1,$nmlo0,$nmlo0 628 ldw 4($xfer),$nmlo1 629 addc %r0,$nmhi0,$hi1 630 stws,ma $nmlo0,8($tp) ; tp[j-1] 631 632 ldo -1($num),$num ; i-- 633 subi 0,$arrsz,$idx ; j=0 634 635 fldws,ma 4($bp),${fbi} ; bp[1] 636 flddx $idx($ap),${fai} ; ap[0,1] 637 flddx $idx($np),${fni} ; np[0,1] 638 fldws 8($xfer),${fti}R ; tp[0] 639 add $hi0,$ablo,$ablo 640 addc %r0,$abhi,$hi0 641 ldo 8($idx),$idx ; j++++ 642 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 643 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 644 add $hi1,$nmlo1,$nmlo1 645 addc %r0,$nmhi1,$nmhi1 646 add $ablo,$nmlo1,$nmlo1 647 addc %r0,$nmhi1,$hi1 648 fstws,mb ${fab0}L,-8($xfer) ; save high part 649 stw $nmlo1,-4($tp) ; tp[j-1] 650 651 fcpy,sgl %fr0,${fti}L ; zero high part 652 fcpy,sgl %fr0,${fab0}L 653 add 
$hi1,$hi0,$hi0 654 addc %r0,%r0,$hi1 655 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 656 fcnvxf,dbl,dbl ${fab0},${fab0} 657 stw $hi0,0($tp) 658 stw $hi1,4($tp) 659 660 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 661 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 662 xmpyu ${fn0},${fab0}R,${fm0} 663 ldo `$LOCALS+32+4`($fp),$tp 664L\$outer_pa11 665 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 666 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 667 fstds ${fab0},-16($xfer) ; 33-bit value 668 fstds ${fnm0},-8($xfer) 669 flddx $idx($ap),${fai} ; ap[2,3] 670 flddx $idx($np),${fni} ; np[2,3] 671 ldw -16($xfer),$abhi ; carry bit actually 672 ldo 8($idx),$idx ; j++++ 673 ldw -12($xfer),$ablo 674 ldw -8($xfer),$nmhi0 675 ldw -4($xfer),$nmlo0 676 ldw 0($xfer),$hi0 ; high part 677 678 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 679 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 680 fstds ${fab1},0($xfer) 681 addl $abhi,$hi0,$hi0 ; account carry bit 682 fstds ${fnm1},8($xfer) 683 add $ablo,$nmlo0,$nmlo0 ; discarded 684 ldw 0($tp),$ti1 ; tp[1] 685 addc %r0,$nmhi0,$hi1 686 fstds ${fab0},-16($xfer) 687 fstds ${fnm0},-8($xfer) 688 ldw 4($xfer),$ablo 689 ldw 0($xfer),$abhi 690 691L\$inner_pa11 692 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 693 flddx $idx($ap),${fai} ; ap[j,j+1] 694 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 695 flddx $idx($np),${fni} ; np[j,j+1] 696 add $hi0,$ablo,$ablo 697 ldw 4($tp),$ti0 ; tp[j] 698 addc %r0,$abhi,$abhi 699 ldw 12($xfer),$nmlo1 700 add $ti1,$ablo,$ablo 701 ldw 8($xfer),$nmhi1 702 addc %r0,$abhi,$hi0 703 fstds ${fab1},0($xfer) 704 add $ablo,$nmlo1,$nmlo1 705 fstds ${fnm1},8($xfer) 706 addc %r0,$nmhi1,$nmhi1 707 ldw -12($xfer),$ablo 708 add $hi1,$nmlo1,$nmlo1 709 ldw -16($xfer),$abhi 710 addc %r0,$nmhi1,$hi1 711 712 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 713 ldw 8($tp),$ti1 ; tp[j] 714 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 715 ldw -4($xfer),$nmlo0 716 add $hi0,$ablo,$ablo 717 ldw -8($xfer),$nmhi0 718 addc %r0,$abhi,$abhi 719 stw 
$nmlo1,-4($tp) ; tp[j-1] 720 add $ti0,$ablo,$ablo 721 fstds ${fab0},-16($xfer) 722 addc %r0,$abhi,$hi0 723 fstds ${fnm0},-8($xfer) 724 add $ablo,$nmlo0,$nmlo0 725 ldw 4($xfer),$ablo 726 addc %r0,$nmhi0,$nmhi0 727 ldw 0($xfer),$abhi 728 add $hi1,$nmlo0,$nmlo0 729 stws,ma $nmlo0,8($tp) ; tp[j-1] 730 addib,<> 8,$idx,L\$inner_pa11 ; j++++ 731 addc %r0,$nmhi0,$hi1 732 733 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 734 ldw 12($xfer),$nmlo1 735 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 736 ldw 8($xfer),$nmhi1 737 add $hi0,$ablo,$ablo 738 ldw 4($tp),$ti0 ; tp[j] 739 addc %r0,$abhi,$abhi 740 fstds ${fab1},0($xfer) 741 add $ti1,$ablo,$ablo 742 fstds ${fnm1},8($xfer) 743 addc %r0,$abhi,$hi0 744 ldw -16($xfer),$abhi 745 add $ablo,$nmlo1,$nmlo1 746 ldw -12($xfer),$ablo 747 addc %r0,$nmhi1,$nmhi1 748 ldw -8($xfer),$nmhi0 749 add $hi1,$nmlo1,$nmlo1 750 ldw -4($xfer),$nmlo0 751 addc %r0,$nmhi1,$hi1 752 753 add $hi0,$ablo,$ablo 754 stw $nmlo1,-4($tp) ; tp[j-1] 755 addc %r0,$abhi,$abhi 756 add $ti0,$ablo,$ablo 757 ldw 8($tp),$ti1 ; tp[j] 758 addc %r0,$abhi,$hi0 759 ldw 0($xfer),$abhi 760 add $ablo,$nmlo0,$nmlo0 761 ldw 4($xfer),$ablo 762 addc %r0,$nmhi0,$nmhi0 763 ldws,mb 8($xfer),$nmhi1 764 add $hi1,$nmlo0,$nmlo0 765 ldw 4($xfer),$nmlo1 766 addc %r0,$nmhi0,$hi1 767 stws,ma $nmlo0,8($tp) ; tp[j-1] 768 769 addib,= -1,$num,L\$outerdone_pa11; i-- 770 subi 0,$arrsz,$idx ; j=0 771 772 fldws,ma 4($bp),${fbi} ; bp[i] 773 flddx $idx($ap),${fai} ; ap[0] 774 add $hi0,$ablo,$ablo 775 addc %r0,$abhi,$abhi 776 flddx $idx($np),${fni} ; np[0] 777 fldws 8($xfer),${fti}R ; tp[0] 778 add $ti1,$ablo,$ablo 779 addc %r0,$abhi,$hi0 780 781 ldo 8($idx),$idx ; j++++ 782 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 783 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 784 ldw 4($tp),$ti0 ; tp[j] 785 786 add $hi1,$nmlo1,$nmlo1 787 addc %r0,$nmhi1,$nmhi1 788 fstws,mb ${fab0}L,-8($xfer) ; save high part 789 add $ablo,$nmlo1,$nmlo1 790 addc %r0,$nmhi1,$hi1 791 fcpy,sgl %fr0,${fti}L ; zero high part 792 fcpy,sgl 
%fr0,${fab0}L 793 stw $nmlo1,-4($tp) ; tp[j-1] 794 795 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 796 fcnvxf,dbl,dbl ${fab0},${fab0} 797 add $hi1,$hi0,$hi0 798 addc %r0,%r0,$hi1 799 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 800 add $ti0,$hi0,$hi0 801 addc %r0,$hi1,$hi1 802 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 803 stw $hi0,0($tp) 804 stw $hi1,4($tp) 805 xmpyu ${fn0},${fab0}R,${fm0} 806 807 b L\$outer_pa11 808 ldo `$LOCALS+32+4`($fp),$tp 809 810L\$outerdone_pa11 811 add $hi0,$ablo,$ablo 812 addc %r0,$abhi,$abhi 813 add $ti1,$ablo,$ablo 814 addc %r0,$abhi,$hi0 815 816 ldw 4($tp),$ti0 ; tp[j] 817 818 add $hi1,$nmlo1,$nmlo1 819 addc %r0,$nmhi1,$nmhi1 820 add $ablo,$nmlo1,$nmlo1 821 addc %r0,$nmhi1,$hi1 822 stw $nmlo1,-4($tp) ; tp[j-1] 823 824 add $hi1,$hi0,$hi0 825 addc %r0,%r0,$hi1 826 add $ti0,$hi0,$hi0 827 addc %r0,$hi1,$hi1 828 stw $hi0,0($tp) 829 stw $hi1,4($tp) 830 831 ldo `$LOCALS+32+4`($fp),$tp 832 sub %r0,%r0,%r0 ; clear borrow 833 ldw -4($tp),$ti0 834 addl $tp,$arrsz,$tp 835L\$sub_pa11 836 ldwx $idx($np),$hi0 837 subb $ti0,$hi0,$hi1 838 ldwx $idx($tp),$ti0 839 addib,<> 4,$idx,L\$sub_pa11 840 stws,ma $hi1,4($rp) 841 842 subb $ti0,%r0,$hi1 843 ldo -4($tp),$tp 844 and $tp,$hi1,$ap 845 andcm $rp,$hi1,$bp 846 or $ap,$bp,$np 847 848 sub $rp,$arrsz,$rp ; rewind rp 849 subi 0,$arrsz,$idx 850 ldo `$LOCALS+32`($fp),$tp 851L\$copy_pa11 852 ldwx $idx($np),$hi0 853 stws,ma %r0,4($tp) 854 addib,<> 4,$idx,L\$copy_pa11 855 stws,ma $hi0,4($rp) 856 857 nop ; alignment 858L\$done 859___ 860} 861 862$code.=<<___; 863 ldi 1,%r28 ; signal "handled" 864 ldo $FRAME($fp),%sp ; destroy tp[num+1] 865 866 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 867 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 868 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 869 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 870 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 871 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 872 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 873 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 874L\$abort 875 bv 
(%r2) 876 .EXIT 877 $POPMB -$FRAME(%sp),%r3 878 .PROCEND 879___ 880 881# Explicitly encode PA-RISC 2.0 instructions used in this module, so 882# that it can be compiled with .LEVEL 1.0. It should be noted that I 883# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 884# directive... 885 886my $ldd = sub { 887 my ($mod,$args) = @_; 888 my $orig = "ldd$mod\t$args"; 889 890 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 891 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; 892 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 893 } 894 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 895 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; 896 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset 897 $opcode|=(1<<5) if ($mod =~ /^,m/); 898 $opcode|=(1<<13) if ($mod =~ /^,mb/); 899 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 900 } 901 else { "\t".$orig; } 902}; 903 904my $std = sub { 905 my ($mod,$args) = @_; 906 my $orig = "std$mod\t$args"; 907 908 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 909 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); 910 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset 911 $opcode|=(1<<5) if ($mod =~ /^,m/); 912 $opcode|=(1<<13) if ($mod =~ /^,mb/); 913 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 914 } 915 else { "\t".$orig; } 916}; 917 918my $extrd = sub { 919 my ($mod,$args) = @_; 920 my $orig = "extrd$mod\t$args"; 921 922 # I only have ",u" completer, it's implicitly encoded... 
923 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 924 { my $opcode=(0x36<<26)|($1<<21)|($4<<16); 925 my $len=32-$3; 926 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos 927 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len 928 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 929 } 930 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 931 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); 932 my $len=32-$2; 933 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len 934 $opcode |= (1<<13) if ($mod =~ /,\**=/); 935 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 936 } 937 else { "\t".$orig; } 938}; 939 940my $shrpd = sub { 941 my ($mod,$args) = @_; 942 my $orig = "shrpd$mod\t$args"; 943 944 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 945 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; 946 my $cpos=63-$3; 947 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa 948 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 949 } 950 else { "\t".$orig; } 951}; 952 953my $sub = sub { 954 my ($mod,$args) = @_; 955 my $orig = "sub$mod\t$args"; 956 957 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { 958 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; 959 $opcode|=(1<<10); # e1 960 $opcode|=(1<<8); # e2 961 $opcode|=(1<<5); # d 962 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig 963 } 964 else { "\t".$orig; } 965}; 966 967sub assemble { 968 my ($mnemonic,$mod,$args)=@_; 969 my $opcode = eval("\$$mnemonic"); 970 971 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; 972} 973 974foreach (split("\n",$code)) { 975 s/\`([^\`]*)\`/eval $1/ge; 976 # flip word order in 64-bit mode... 977 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); 978 # assemble 2.0 instructions in 32-bit mode... 979 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); 980 981 s/\bbv\b/bve/gm if ($SIZE_T==8); 982 983 print $_,"\n"; 984} 985close STDOUT; 986