#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, a server-side
# operation. Keep in mind that +400% means 5x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
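# (In other words: with the usual w=7 layout, ceil(256/7) = 37 windows,
# 64 points per window and 16 32-bit words per 64-byte
# P256_POINT_AFFINE entry, @arr should hold exactly 64*16*37 words.)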
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE into individual bytes at
# a 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
.align	6
ecp_nistz256_to_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,.LRR		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.LRR		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
.align	4
ecp_nistz256_from_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	mov	$bi,#1			// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	adr	$bp,.Lone		// &bp[0]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-32]!
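	// note: x19 and x20 double as \$acc4/\$acc5 in the subroutine
	// called below (see the register map at the top of this file),
	// hence the extra callee-saved pair in this 32-byte frame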
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3

	bl	__ecp_nistz256_add	// ret = a+a	// 2*a

	mov	$t0,$a0
	mov	$t1,$a1
	mov	$t2,$a2
	mov	$t3,$a3

	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
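	// a 16-byte frame suffices here: __ecp_nistz256_sub_from stays
	// within x0-x17, so no callee-saved registers need spilling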
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$acc2,$acc3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
	# Each reduction iteration is normally performed by accumulating
	# the result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus, and the
	# "magic" digit being equal to the least significant word, it
	# can be performed with additions and subtractions alone. Indeed:
	#
	#	    ffff0001.00000000.0000ffff.ffffffff
	#	                            * abcdefgh
	#	+ xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#	  xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#	+ abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	#	- 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#	  xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	#	+ abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	#	- 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs $t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is \$accx, i.e. each
	//  column accumulates into the corresponding \$accx
	//
	// The "can't overflow" remarks below mark carries into the high
	// part of a multiplication result; these cannot overflow,
	// because the high part can never be all ones.
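	// (Why it can never be all ones: for 64-bit a and b,
	// a*b <= (2^64-1)^2 = 2^128-2^65+1, so umulh returns at most
	// 2^64-2 and can absorb a carry without wrapping. The doubling
	// of the off-diagonal terms below implements
	// a^2 = sum_i a[i]^2*2^(128*i) + 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)).)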

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {	# reductions, see commentary in
			# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs $t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs $t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds $t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds $t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds $t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
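	// 96-byte frame for x29,x30 and x19-x22; the sub sp,sp,#32*4
	// below additionally reserves the four 256-bit temporaries
	// \$S, \$M, \$Zsqr and \$tmp0 laid out by the map() above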
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[$ap_real]
	ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)

	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
	orr	$acc0,$acc0,$temp1
	orr	$acc0,$acc0,$temp2
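	// after the final orr below \$acc0 is zero only if both inputs
	// are finite and equal (U1==U2 and S1==S2), i.e. only in the
	// case that must fall through to .Ladd_double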
	orr	$acc0,$acc0,$temp0
	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#32*(12-4)	// difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~\$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~\$in1infty, remember?
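	// net effect of the two csel cascades below:
	// res = in2infty ? in1 : (in1infty ? in2 : res)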
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~\$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~\$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~\$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~\$in2infty, remember?
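	// for these z limbs "in2" (\$t0-\$t3) actually holds .Lone_mont,
	// the affine point's implicit z==1, courtesy of the adr in the
	// first pass of the loop above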
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//				  uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	#                                        * abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

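	// \$t4 = \$acc0*\$ordk, with .LordK = -1/ord mod 2^64, so that
	// lo(\$acc0 + ord[0]*\$t4) == 0 by construction; the
	// subs xzr,\$acc0,#1 above reproduced exactly the carry that
	// addition would generate, letting lo(ord[0]*\$t4) be skipped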
	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//				  uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is \$accx, i.e. each
	//  column accumulates into the corresponding \$accx
	//
	// The "can't overflow" remarks below mark carries into the high
	// part of a multiplication result; these cannot overflow,
	// because the high part can never be all ones.
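	// (same bound argument as in __ecp_nistz256_sqr_mont above;
	// in the reduction that follows only ord[0] and ord[1] need
	// real multiplications by the .LordK-derived digit, while the
	// two special-form top words of .Lord are folded in with
	// shifts, adds and subtracts as in the ord_mul commentary)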

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	stp	x29,x30,[sp,#-16]!
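	// 256-bit values are scattered as 32-bit halves at a 64-byte
	// stride, so every cache line holds slices of many table
	// entries and the matching gather touches the same lines
	// whichever index is requested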
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	stp	x29,x30,[sp,#-16]!
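	// w7 entries are byte-sliced at a 64-byte stride, matching the
	// layout of ecp_nistz256_precomputed generated at the top of
	// this file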
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//						      int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush