#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Locate x86asm.pl: look next to this script and in ../../perlasm
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the command-line arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&function_begin("bn_mul_mont");

# Symbolic register names.  Note that two pairs share a physical
# register, so only one name of each pair is live at any given time.
$i="edx";
$j="ecx";
$ap="esi";  $tp="esi";  # overlapping variables!!!
$rp="edi";  $bp="edi";  # overlapping variables!!!
$np="ebp";
$num="ebx";

# Scratch frame addressed off esp; the temporary vector tp[] starts
# $frame bytes above esp.
$_num=&DWP(4*0,"esp");  # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;              # size of above frame rounded up to 16n

    # Bail out with eax=0 when num<4.
    &xor    ("eax","eax");
    &mov    ("edi",&wparam(5));  # int num
    &cmp    ("edi",4);
    &jl     (&label("just_leave"));

    &lea    ("esi",&wparam(0));  # put aside pointer to argument block
    &lea    ("edx",&wparam(1));  # load ap
    &mov    ("ebp","esp");  # saved stack pointer!
    &add    ("edi",2);  # extra two words on top of tp
    &neg    ("edi");
    &lea    ("esp",&DWP(-$frame,"esp","edi",4));  # alloca($frame+4*(num+2))
    &neg    ("edi");

    # minimize cache contention by arranging 2K window between stack
    # pointer and ap argument [np is also position sensitive vector,
    # but it's assumed to be near ap, as it's allocated at ~same
    # time].
    &mov    ("eax","esp");
    &sub    ("eax","edx");
    &and    ("eax",2047);
    &sub    ("esp","eax");  # this aligns sp and ap modulo 2048

    &xor    ("edx","esp");
    &and    ("edx",2048);
    &xor    ("edx",2048);
    &sub    ("esp","edx");  # this splits them apart modulo 4096

    &and    ("esp",-64);  # align to cache line

    ################################# load argument block...
    &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
    &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
    &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
    &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
    &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
    #&mov   ("edi",&DWP(5*4,"esi"));# int num

    &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
    &mov    ($_rp,"eax");  # ... save a copy of argument block
    &mov    ($_ap,"ebx");
    &mov    ($_bp,"ecx");
    &mov    ($_np,"edx");
    &mov    ($_n0,"esi");
    &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
    #&mov   ($_num,$num);  # redundant as $num is not reused
    &mov    ($_sp,"ebp");  # saved stack pointer!
# SSE2 path.  Emitted only when $sse2 is set; at run time it is taken
# only if the SSE2 bit is set in OPENSSL_ia32cap_P, otherwise control
# transfers to the integer-only code at "non_sse2".
if($sse2) {
$acc0="mm0";  # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

    &picsetup("eax");
    &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
    &bt     (&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
    &jnc    (&label("non_sse2"));

    &mov    ("eax",-1);
    &movd   ($mask,"eax");  # mask 32 lower bits

    &mov    ($ap,$_ap);  # load input pointers
    &mov    ($bp,$_bp);
    &mov    ($np,$_np);

    &xor    ($i,$i);  # i=0
    &xor    ($j,$j);  # j=0

    &movd   ($mul0,&DWP(0,$bp));  # bp[0]
    &movd   ($mul1,&DWP(0,$ap));  # ap[0]
    &movd   ($car1,&DWP(0,$np));  # np[0]

    &pmuludq($mul1,$mul0);  # ap[0]*bp[0]
    &movq   ($car0,$mul1);
    &movq   ($acc0,$mul1);  # I wish movd worked for
    &pand   ($acc0,$mask);  # inter-register transfers

    &pmuludq($mul1,$_n0q);  # *=n0

    &pmuludq($car1,$mul1);  # "t[0]"*np[0]*n0
    &paddq  ($car1,$acc0);

    &movd   ($acc1,&DWP(4,$np));  # np[1]
    &movd   ($acc0,&DWP(4,$ap));  # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &inc    ($j);  # j++
&set_label("1st",16);
    &pmuludq($acc0,$mul0);  # ap[j]*bp[0]
    &pmuludq($acc1,$mul1);  # np[j]*m1
    &paddq  ($car0,$acc0);  # +=c0
    &paddq  ($car1,$acc1);  # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));  # np[j+1]
    &paddq  ($car1,$acc0);  # +=ap[j]*bp[0];
    &movd   ($acc0,&DWP(4,$ap,$j,4));  # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[j-1]=
    &psrlq  ($car1,32);

    &lea    ($j,&DWP(1,$j));
    &cmp    ($j,$num);
    &jl     (&label("1st"));

    &pmuludq($acc0,$mul0);  # ap[num-1]*bp[0]
    &pmuludq($acc1,$mul1);  # np[num-1]*m1
    &paddq  ($car0,$acc0);  # +=c0
    &paddq  ($car1,$acc1);  # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);  # +=ap[num-1]*bp[0];
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &paddq  ($car1,$car0);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    &inc    ($i);  # i++
&set_label("outer");
    &xor    ($j,$j);  # j=0

    &movd   ($mul0,&DWP(0,$bp,$i,4));  # bp[i]
    &movd   ($mul1,&DWP(0,$ap));  # ap[0]
    &movd   ($temp,&DWP($frame,"esp"));  # tp[0]
    &movd   ($car1,&DWP(0,$np));  # np[0]
    &pmuludq($mul1,$mul0);  # ap[0]*bp[i]

    &paddq  ($mul1,$temp);  # +=tp[0]
    &movq   ($acc0,$mul1);
    &movq   ($car0,$mul1);
    &pand   ($acc0,$mask);

    &pmuludq($mul1,$_n0q);  # *=n0

    &pmuludq($car1,$mul1);
    &paddq  ($car1,$acc0);

    &movd   ($temp,&DWP($frame+4,"esp"));  # tp[1]
    &movd   ($acc1,&DWP(4,$np));  # np[1]
    &movd   ($acc0,&DWP(4,$ap));  # ap[1]

    &psrlq  ($car0,32);
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);  # +=tp[1]

    &inc    ($j);  # j++
    &dec    ($num);
    # $num doubles as the down-counter of the inner loop; it is
    # restored from $j once the loop is done.
&set_label("inner");
    &pmuludq($acc0,$mul0);  # ap[j]*bp[i]
    &pmuludq($acc1,$mul1);  # np[j]*m1
    &paddq  ($car0,$acc0);  # +=c0
    &paddq  ($car1,$acc1);  # +=c1

    &movq   ($acc0,$car0);
    &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
    &pand   ($acc0,$mask);
    &movd   ($acc1,&DWP(4,$np,$j,4));  # np[j+1]
    &paddq  ($car1,$acc0);  # +=ap[j]*bp[i]+tp[j]
    &movd   ($acc0,&DWP(4,$ap,$j,4));  # ap[j+1]
    &psrlq  ($car0,32);
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
    &psrlq  ($car1,32);
    &paddq  ($car0,$temp);  # +=tp[j+1]

    &dec    ($num);
    &lea    ($j,&DWP(1,$j));  # j++
    &jnz    (&label("inner"));

    &mov    ($num,$j);
    &pmuludq($acc0,$mul0);  # ap[num-1]*bp[i]
    &pmuludq($acc1,$mul1);  # np[num-1]*m1
    &paddq  ($car0,$acc0);  # +=c0
    &paddq  ($car1,$acc1);  # +=c1

    &movq   ($acc0,$car0);
    &pand   ($acc0,$mask);
    &paddq  ($car1,$acc0);  # +=ap[num-1]*bp[i]+tp[num-1]
    &movd   (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=
    &psrlq  ($car0,32);
    &psrlq  ($car1,32);

    &movd   ($temp,&DWP($frame+4,"esp",$num,4));  # += tp[num]
    &paddq  ($car1,$car0);
    &paddq  ($car1,$temp);
    &movq   (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]

    &lea    ($i,&DWP(1,$i));  # i++
    &cmp    ($i,$num);
    &jle    (&label("outer"));

    &emms   ();  # done with mmx bank
    &jmp    (&label("common_tail"));

&set_label("non_sse2",16);
}

# The if (0) arm below is deliberately disabled: it would restore esp
# and leave with eax=0 instead of running the integer-only code.
if (0) {
    &mov    ("esp",$_sp);
    &xor    ("eax","eax");  # signal "not fast enough [yet]"
    &jmp    (&label("just_leave"));
    # While the below code provides competitive performance for
    # all key lengths on modern Intel cores, it's still more
    # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
    # means compared to the original integer-only assembler.
    # 512-bit RSA sign is better by ~40%, but that's about all
    # one can say about all CPUs...
} else {
$inp="esi";  # integer path uses these registers differently
$word="edi";
$carry="ebp";

    # Dispatch: the dedicated squaring code at "bn_sqr_mont" is used
    # only when num is even and ap==bp; the mov between the or and
    # the jz does not touch the flags.
    &mov    ($inp,$_ap);
    &lea    ($carry,&DWP(1,$num));
    &mov    ($word,$_bp);
    &xor    ($j,$j);  # j=0
    &mov    ("edx",$inp);
    &and    ($carry,1);  # see if num is even
    &sub    ("edx",$word);  # see if ap==bp
    &lea    ("eax",&DWP(4,$word,$num,4));  # &bp[num]
    &or     ($carry,"edx");
    &mov    ($word,&DWP(0,$word));  # bp[0]
    &jz     (&label("bn_sqr_mont"));
    &mov    ($_bpend,"eax");
    &mov    ("eax",&DWP(0,$inp));
    &xor    ("edx","edx");

&set_label("mull",16);
    &mov    ($carry,"edx");
    &mul    ($word);  # ap[j]*bp[0]
    &add    ($carry,"eax");
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j+1]
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry);  # tp[j]=
    &jl     (&label("mull"));

    &mov    ($carry,"edx");
    &mul    ($word);  # ap[num-1]*bp[0]
    &mov    ($word,$_n0);
    &add    ("eax",$carry);
    &mov    ($inp,$_np);
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp"));  # n0*tp[0]

    &mov    (&DWP($frame,"esp",$num,4),"eax");  # tp[num-1]=
    &xor    ($j,$j);
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");  # tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);  # tp[num+1]=

    &mov    ("eax",&DWP(0,$inp));  # np[0]
    &mul    ($word);  # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));  # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));  # np[1]
    &adc    ("edx",0);
    &inc    ($j);

    &jmp    (&label("2ndmadd"));

&set_label("1stmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);  # ap[j]*bp[i]
    &add    ($carry,&DWP($frame,"esp",$j,4));  # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry);  # tp[j]=
    &jl     (&label("1stmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);  # ap[num-1]*bp[i]
    &add    ("eax",&DWP($frame,"esp",$num,4));  # +=tp[num-1]
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &imul   ($word,&DWP($frame,"esp"));  # n0*tp[0]

    &xor    ($j,$j);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));  # carry+=tp[num]
    &mov    (&DWP($frame,"esp",$num,4),$carry);  # tp[num-1]=
    &adc    ($j,0);
    &mov    ("eax",&DWP(0,$inp));  # np[0]
    &mov    (&DWP($frame+4,"esp",$num,4),"edx");  # tp[num]=
    &mov    (&DWP($frame+8,"esp",$num,4),$j);  # tp[num+1]=

    &mul    ($word);  # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));  # +=tp[0]
    &mov    ("eax",&DWP(4,$inp));  # np[1]
    &adc    ("edx",0);
    &mov    ($j,1);

&set_label("2ndmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);  # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));  # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));  # np[j+1]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry);  # tp[j-1]=
    &jl     (&label("2ndmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);  # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4));  # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);  # tp[num-2]=

    &xor    ("eax","eax");
    &mov    ($j,$_bp);  # &bp[i]
    &add    ("edx",&DWP($frame+4,"esp",$num,4));  # carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));  # +=tp[num+1]
    &lea    ($j,&DWP(4,$j));
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$_bpend);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");  # tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(0,$j));  # bp[i+1]
    &mov    ($inp,$_ap);
    &mov    ($_bp,$j);  # &bp[++i]
    &xor    ($j,$j);
    &xor    ("edx","edx");
    &mov    ("eax",&DWP(0,$inp));
    &jmp    (&label("1stmadd"));

&set_label("bn_sqr_mont",16);
# $sbit shares the register of $num, so $num is parked in the $_num
# frame slot while the squaring loops run.
$sbit=$num;
    &mov    ($_num,$num);
    &mov    ($_bp,$j);  # i=0

    &mov    ("eax",$word);  # ap[0]
    &mul    ($word);  # ap[0]*ap[0]
    &mov    (&DWP($frame,"esp"),"eax");  # tp[0]=
    &mov    ($sbit,"edx");
    &shr    ("edx",1);
    &and    ($sbit,1);
    &inc    ($j);
&set_label("sqr",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);  # ap[j]*ap[0]
    &add    ("eax",$carry);
    &lea    ($j,&DWP(1,$j));
    &adc    ("edx",0);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &shr    ("eax",31);
    &cmp    ($j,$_num);
    &mov    ($sbit,"eax");
    &mov    (&DWP($frame-4,"esp",$j,4),$carry);  # tp[j]=
    &jl     (&label("sqr"));

    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[num-1]
    &mov    ($carry,"edx");
    &mul    ($word);  # ap[num-1]*ap[0]
    &add    ("eax",$carry);
    &mov    ($word,$_n0);
    &adc    ("edx",0);
    &mov    ($inp,$_np);
    &lea    ($carry,&DWP(0,$sbit,"eax",2));
    &imul   ($word,&DWP($frame,"esp"));  # n0*tp[0]
    &shr    ("eax",31);
    &mov    (&DWP($frame,"esp",$j,4),$carry);  # tp[num-1]=

    &lea    ($carry,&DWP(0,"eax","edx",2));
    &mov    ("eax",&DWP(0,$inp));  # np[0]
    &shr    ("edx",31);
    &mov    (&DWP($frame+4,"esp",$j,4),$carry);  # tp[num]=
    &mov    (&DWP($frame+8,"esp",$j,4),"edx");  # tp[num+1]=

    &mul    ($word);  # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));  # +=tp[0]
    &mov    ($num,$j);
    &adc    ("edx",0);
    &mov    ("eax",&DWP(4,$inp));  # np[1]
    &mov    ($j,1);

    # Reduction loop of the squaring path, handling two words of np
    # per iteration.
&set_label("3rdmadd",16);
    &mov    ($carry,"edx");
    &mul    ($word);  # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$j,4));  # +=tp[j]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(4,$inp,$j,4));  # np[j+1]
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry);  # tp[j-1]=

    &mov    ($carry,"edx");
    &mul    ($word);  # np[j+1]*m
    &add    ($carry,&DWP($frame+4,"esp",$j,4));  # +=tp[j+1]
    &lea    ($j,&DWP(2,$j));
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &mov    ("eax",&DWP(0,$inp,$j,4));  # np[j+2]
    &adc    ("edx",0);
    &cmp    ($j,$num);
    &mov    (&DWP($frame-8,"esp",$j,4),$carry);  # tp[j]=
    &jl     (&label("3rdmadd"));

    &mov    ($carry,"edx");
    &mul    ($word);  # np[j]*m
    &add    ($carry,&DWP($frame,"esp",$num,4));  # +=tp[num-1]
    &adc    ("edx",0);
    &add    ($carry,"eax");
    &adc    ("edx",0);
    &mov    (&DWP($frame-4,"esp",$num,4),$carry);  # tp[num-2]=

    &mov    ($j,$_bp);  # i
    &xor    ("eax","eax");
    &mov    ($inp,$_ap);
    &add    ("edx",&DWP($frame+4,"esp",$num,4));  # carry+=tp[num]
    &adc    ("eax",&DWP($frame+8,"esp",$num,4));  # +=tp[num+1]
    &mov    (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
    &cmp    ($j,$num);
    &mov    (&DWP($frame+4,"esp",$num,4),"eax");  # tp[num]=
    &je     (&label("common_tail"));

    &mov    ($word,&DWP(4,$inp,$j,4));  # ap[i]
    &lea    ($j,&DWP(1,$j));
    &mov    ("eax",$word);
    &mov    ($_bp,$j);  # ++i
    &mul    ($word);  # ap[i]*ap[i]
    &add    ("eax",&DWP($frame,"esp",$j,4));  # +=tp[i]
    &adc    ("edx",0);
    &mov    (&DWP($frame,"esp",$j,4),"eax");  # tp[i]=
    &xor    ($carry,$carry);
    &cmp    ($j,$num);
    &lea    ($j,&DWP(1,$j));
    &je     (&label("sqrlast"));

    &mov    ($sbit,"edx");  # zaps $num
    &shr    ("edx",1);
    &and    ($sbit,1);
&set_label("sqradd",16);
    &mov    ("eax",&DWP(0,$inp,$j,4));  # ap[j]
    &mov    ($carry,"edx");
    &mul    ($word);  # ap[j]*ap[i]
    &add    ("eax",$carry);
    &lea    ($carry,&DWP(0,"eax","eax"));
    &adc    ("edx",0);
    &shr    ("eax",31);
    &add    ($carry,&DWP($frame,"esp",$j,4));  # +=tp[j]
    &lea    ($j,&DWP(1,$j));
    &adc    ("eax",0);
    &add    ($carry,$sbit);
    &adc    ("eax",0);
    &cmp    ($j,$_num);
    &mov    (&DWP($frame-4,"esp",$j,4),$carry);  # tp[j]=
    &mov    ($sbit,"eax");
    &jle    (&label("sqradd"));

    &mov    ($carry,"edx");
    &add    ("edx","edx");
    &shr    ($carry,31);
    &add    ("edx",$sbit);
    &adc    ($carry,0);
&set_label("sqrlast");
    &mov    ($word,$_n0);
    &mov    ($inp,$_np);
    &imul   ($word,&DWP($frame,"esp"));  # n0*tp[0]

    &add    ("edx",&DWP($frame,"esp",$j,4));  # +=tp[num]
    &mov    ("eax",&DWP(0,$inp));  # np[0]
    &adc    ($carry,0);
    &mov    (&DWP($frame,"esp",$j,4),"edx");  # tp[num]=
    &mov    (&DWP($frame+4,"esp",$j,4),$carry);  # tp[num+1]=

    &mul    ($word);  # np[0]*m
    &add    ("eax",&DWP($frame,"esp"));  # +=tp[0]
    &lea    ($num,&DWP(-1,$j));
    &adc    ("edx",0);
    &mov    ($j,1);
    &mov    ("eax",&DWP(4,$inp));  # np[1]

    &jmp    (&label("3rdmadd"));
}

# Common tail, reached from every multiplication path: subtract np
# from tp into rp, word by word, propagating the borrow through CF.
&set_label("common_tail",16);
    &mov    ($np,$_np);  # load modulus pointer
    &mov    ($rp,$_rp);  # load result pointer
    &lea    ($tp,&DWP($frame,"esp"));  # [$ap and $bp are zapped]

    &mov    ("eax",&DWP(0,$tp));  # tp[0]
    &mov    ($j,$num);  # j=num-1
    &xor    ($i,$i);  # i=0 and clear CF!

&set_label("sub",16);
    &sbb    ("eax",&DWP(0,$np,$i,4));
    &mov    (&DWP(0,$rp,$i,4),"eax");  # rp[i]=tp[i]-np[i]
    &dec    ($j);  # doesn't affect CF!
# Remainder of the "sub" loop: neither mov nor lea alters the flags,
# so jge still tests the result of the preceding dec of j.
    &mov    ("eax",&DWP(4,$tp,$i,4));  # tp[i+1]
    &lea    ($i,&DWP(1,$i));  # i++
    &jge    (&label("sub"));

    # Fold the final borrow into the top word, then use the result
    # and its complement as masks to pick the source pointer without
    # a branch.
    &sbb    ("eax",0);  # handle upmost overflow bit
    &and    ($tp,"eax");
    &not    ("eax");  # complement the mask for the rp side
    &mov    ($np,$rp);
    &and    ($np,"eax");
    &or     ($tp,$np);  # tp=carry?tp:rp

&set_label("copy",16);  # copy or in-place refresh
    &mov    ("eax",&DWP(0,$tp,$num,4));
    &mov    (&DWP(0,$rp,$num,4),"eax");  # rp[i]=tp[i]
    &mov    (&DWP($frame,"esp",$num,4),$j);  # zap temporary vector
    &dec    ($num);
    &jge    (&label("copy"));

    &mov    ("esp",$_sp);  # pull saved stack pointer
    &mov    ("eax",1);  # return 1
&set_label("just_leave");
&function_end("bn_mul_mont");

&asm_finish();