#include "arm_arch.h"
#include "arm_asm.h"

@ Montgomery multiplication for 32-bit ARM (GNU as syntax, '@' comments).
@
@ Two code paths live in this file:
@   bn_mul_mont         - public entry point, integer-only implementation
@   bn_mul8x_mont_neon  - NEON implementation, reached only through the
@                         dispatcher at the top of bn_mul_mont

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont	@ offset used for PC-relative access
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

@ bn_mul_mont
@
@ Register/stack roles on entry, as used by the code below:
@   r0      = rp, result vector (num words)
@   r1      = ap, first operand (num words)
@   r2      = bp, second operand (num words)
@   r3      = np, modulus (num words)
@   [sp,#0] = pointer to the n0 word
@   [sp,#4] = num, vector length in 32-bit words
@
@ Result of the integer path: r0 = 1 after a completed multiplication,
@ r0 = 0 (nothing written) when num < 2.
@
@ Stack frame of the integer path, addressed relative to r0 = &tp[num-1]:
@   tp[0..num]   scratch vector, allocated below the saved registers
@   [r0,#12*4]   saved r0 (rp)
@   [r0,#13*4]   saved r2, reused as the running bp pointer
@   [r0,#14*4]   caller slot holding &n0, overwritten with the n0 value
@   [r0,#15*4]   caller slot holding num, overwritten with &bp[num-1]
.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7			@ NEON code handles multiples of 8 only
	bne	.Lialu
	cmp	ip,#8			@ num<=0 also passes the test above; send
	blt	.Lialu			@ it to the num<2 guard instead of NEON
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]		@ load OPENSSL_armcap_P
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}		@ restore rp and bp (flags untouched)
	beq	.Lialu
	add	sp,sp,#8		@ drop {r0,r2}, sp as on entry
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	r0,ip			@ load num
	movlt	r0,#0			@ num<2: return 0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num-1], outer loop end marker

	umull	r10,r11,r5,r2		@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"t[0]"
	mov	r4,sp

	@ First pass (bp[0]): tp has no previous contents to add in.
.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[0]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.L1st

	adds	r12,r12,r11
	ldr	r4,[r0,#13*4]		@ restore bp
	mov	r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	@ Remaining passes, one per bp[i], i = 1 .. num-1.
.Louter:
	sub	r7,r0,sp		@ "original" r0-1 value
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"tp[0]"
	mov	r4,sp

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[i]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adc	r11,r11,#0
	ldr	r7,[r4,#8]		@ tp[j+1]
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.Linner

	adds	r12,r12,r11
	mov	r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	adc	r14,r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adds	r12,r12,r7
	ldr	r7,[r0,#15*4]		@ restore &bp[num-1] end marker
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7			@ was that bp[num-1]?
	bne	.Louter

	ldr	r2,[r0,#12*4]		@ pull rp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,sp		@ "original" num value
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	@ Final reduction: rp = tp - np, then keep either that difference or
	@ tp itself depending on whether the subtraction borrowed.
	subs	r7,r7,r7		@ "clear" carry flag (sets C=1, no borrow)
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	and	r1,r4,r14		@ r14 is used as an address mask here
	bic	r3,r2,r14
	orr	r1,r1,r3		@ ap=borrow?tp:rp

.Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
	str	sp,[r4],#4		@ zap tp
	str	r7,[r2],#4
	cmp	r4,r0
	bne	.Lcopy

	add	sp,r0,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	RET				@ .word	0xe12fff1e
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@ bn_mul8x_mont_neon
@
@ NEON path, entered by a plain branch from bn_mul_mont with r0-r3 and sp
@ exactly as the caller passed them. The dispatcher only branches here when
@ num is a multiple of 8 and at least 8; the loop counters below rely on it.
@
@ Roles as used by the code below:
@   r0 = rp, r1 = ap, r2 = bp, r3 = np
@   r4 = pointer to n0 and r5 = num, both loaded from the caller stack
@   r6 = read pointer, r7 = write pointer into the stack scratch area
@   r8 = inner counter (words left), r9 = outer counter
@   ip = sp at entry, used to unwind the frame
@   d0-d3 = eight ap words, d4-d7 = eight np words
@   d28   = current bp word split into two 16-bit halves (via vzip.16)
@   d29   = per-pass multiplier for np, low word times n0, split likewise
@   d30   = n0 word, q6-q13 = 64-bit accumulators, d10 = running carry
@
@ r0 is not reloaded before returning: it is left pointing just past
@ rp[num-1], so the value handed back on this path is that pointer, not 1.
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub	r7,sp,#16
	vld1.32	{d28[0]}, [r2,:32]!
	sub	r7,r7,r5,lsl#4		@ 16 bytes of scratch per word
	vld1.32	{d0-d3}, [r1]!		@ can't specify :32 :-(
	and	r7,r7,#-64		@ align scratch area down to 64 bytes
	vld1.32	{d30[0]}, [r4,:32]
	mov	sp,r7			@ alloca
	veor	d8,d8,d8
	subs	r8,r5,#8		@ flags consumed by bne .LNEON_1st below
	vzip.16	d28,d8

	vmull.u32	q6,d28,d0[0]
	vmull.u32	q7,d28,d0[1]
	vmull.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmull.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor	d8,d8,d8
	vmul.u32	d29,d10,d30

	vmull.u32	q10,d28,d2[0]
	vld1.32	{d4-d7}, [r3]!
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vzip.16	d29,d8
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	q6,d29,d4[0]
	sub	r9,r5,#1
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov	q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov	q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov	q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov	q8,q9
	vmov	q9,q10
	vshr.u64	d10,d10,#16
	vmov	q10,q11
	vmov	q11,q12
	vadd.u64	d10,d10,d11
	vmov	q12,q13
	veor	q13,q13
	vshr.u64	d10,d10,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32	{d28[0]}, [r2,:32]!
	veor	d8,d8,d8
	vzip.16	d28,d8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmlal.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor	d8,d8,d8
	subs	r9,r9,#1
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16	d29,d8
	vmlal.u32	q13,d28,d3[1]

	vmlal.u32	q6,d29,d4[0]
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov	q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov	q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov	q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov	q8,q9
	vmov	q9,q10
	vshr.u64	d10,d10,#16
	vmov	q10,q11
	vmov	q11,q12
	vadd.u64	d10,d10,d11
	vmov	q12,q13
	veor	q13,q13
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer8

	vadd.u64	d12,d12,d10
	mov	r7,sp
	vshr.u64	d10,d12,#16
	mov	r8,r5
	vadd.u64	d13,d13,d10
	add	r6,sp,#16
	vshr.u64	d10,d13,#16
	vzip.16	d12,d13

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	q6,d29,d4[0]
	vld1.32	{d0-d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs	r8,r8,#8
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vld1.32	{d4-d5}, [r3]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64	{q6-q7}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vmlal.u32	q13,d29,d7[1]
	vst1.64	{q8-q9}, [r7,:256]!

	vmull.u32	q6,d28,d0[0]
	vld1.32	{d6-d7}, [r3]!
	vmull.u32	q7,d28,d0[1]
	vst1.64	{q10-q11}, [r7,:256]!
	vmull.u32	q8,d28,d1[0]
	vmull.u32	q9,d28,d1[1]
	vst1.64	{q12-q13}, [r7,:256]!

	vmull.u32	q10,d28,d2[0]
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	vmlal.u32	q6,d29,d4[0]
	add	r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub	r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	vld1.64	{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	sub	r9,r5,#1

	vmlal.u32	q10,d29,d6[0]
	vst1.64	{q6-q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vshr.u64	d10,d10,#16
	vld1.64	{q6}, [r6, :128]!
	vmlal.u32	q12,d29,d7[0]
	vst1.64	{q8-q9}, [r7,:256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64	{q10-q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	veor	q4,q4,q4
	vst1.64	{q12-q13}, [r7,:256]!
	vld1.64	{q7-q8}, [r6, :256]!
	vst1.64	{q4}, [r7,:128]
	vshr.u64	d10,d10,#16

	b	.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32	{d28[0]}, [r2,:32]!
	sub	r3,r3,r5,lsl#2		@ rewind r3
	vld1.32	{d0-d3}, [r1]!
	veor	d8,d8,d8
	mov	r7,sp
	vzip.16	d28,d8
	sub	r8,r5,#8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vld1.64	{q9-q10},[r6,:256]!
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vld1.64	{q11-q12},[r6,:256]!
	vmlal.u32	q9,d28,d1[1]

	vshl.i64	d10,d13,#16
	veor	d8,d8,d8
	vadd.u64	d10,d10,d12
	vld1.64	{q13},[r6,:128]!
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vld1.32	{d4-d7}, [r3]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16	d29,d8
	vmlal.u32	q13,d28,d3[1]

.LNEON_inner:
	vmlal.u32	q6,d29,d4[0]
	vld1.32	{d0-d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs	r8,r8,#8
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]
	vst1.64	{q6-q7}, [r7,:256]!

	vmlal.u32	q10,d29,d6[0]
	vld1.64	{q6}, [r6, :128]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64	{q8-q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vld1.64	{q7-q8}, [r6, :256]!
	vmlal.u32	q13,d29,d7[1]
	vst1.64	{q10-q11}, [r7,:256]!

	vmlal.u32	q6,d28,d0[0]
	vld1.64	{q9-q10}, [r6, :256]!
	vmlal.u32	q7,d28,d0[1]
	vst1.64	{q12-q13}, [r7,:256]!
	vmlal.u32	q8,d28,d1[0]
	vld1.64	{q11-q12}, [r6, :256]!
	vmlal.u32	q9,d28,d1[1]
	vld1.32	{d4-d7}, [r3]!

	vmlal.u32	q10,d28,d2[0]
	vld1.64	{q13}, [r6, :128]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vmlal.u32	q13,d28,d3[1]

	bne	.LNEON_inner

	vmlal.u32	q6,d29,d4[0]
	add	r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub	r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	vld1.64	{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	subs	r9,r9,#1		@ flags consumed by bne .LNEON_outer below

	vmlal.u32	q10,d29,d6[0]
	vst1.64	{q6-q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vld1.64	{q6}, [r6, :128]!
	vshr.u64	d10,d10,#16
	vst1.64	{q8-q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vld1.64	{q7-q8}, [r6, :256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64	{q10-q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	vst1.64	{q12-q13}, [r7,:256]!
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer

	mov	r7,sp
	mov	r8,r5

	@ Carry propagation: fold the 64-bit accumulators back into 32-bit
	@ words, eight result words per iteration, stored from sp upwards.
.LNEON_tail:
	vadd.u64	d12,d12,d10
	vld1.64	{q9-q10}, [r6, :256]!
	vshr.u64	d10,d12,#16
	vadd.u64	d13,d13,d10
	vld1.64	{q11-q12}, [r6, :256]!
	vshr.u64	d10,d13,#16
	vld1.64	{q13}, [r6, :128]!
	vzip.16	d12,d13

.LNEON_tail2:
	vadd.u64	d14,d14,d10
	vst1.32	{d12[0]}, [r7, :32]!
	vshr.u64	d10,d14,#16
	vadd.u64	d15,d15,d10
	vshr.u64	d10,d15,#16
	vzip.16	d14,d15

	vadd.u64	d16,d16,d10
	vst1.32	{d14[0]}, [r7, :32]!
	vshr.u64	d10,d16,#16
	vadd.u64	d17,d17,d10
	vshr.u64	d10,d17,#16
	vzip.16	d16,d17

	vadd.u64	d18,d18,d10
	vst1.32	{d16[0]}, [r7, :32]!
	vshr.u64	d10,d18,#16
	vadd.u64	d19,d19,d10
	vshr.u64	d10,d19,#16
	vzip.16	d18,d19

	vadd.u64	d20,d20,d10
	vst1.32	{d18[0]}, [r7, :32]!
	vshr.u64	d10,d20,#16
	vadd.u64	d21,d21,d10
	vshr.u64	d10,d21,#16
	vzip.16	d20,d21

	vadd.u64	d22,d22,d10
	vst1.32	{d20[0]}, [r7, :32]!
	vshr.u64	d10,d22,#16
	vadd.u64	d23,d23,d10
	vshr.u64	d10,d23,#16
	vzip.16	d22,d23

	vadd.u64	d24,d24,d10
	vst1.32	{d22[0]}, [r7, :32]!
	vshr.u64	d10,d24,#16
	vadd.u64	d25,d25,d10
	vld1.64	{q6}, [r6, :128]!
	vshr.u64	d10,d25,#16
	vzip.16	d24,d25

	vadd.u64	d26,d26,d10
	vst1.32	{d24[0]}, [r7, :32]!
	vshr.u64	d10,d26,#16
	vadd.u64	d27,d27,d10
	vld1.64	{q7-q8}, [r6, :256]!
	vshr.u64	d10,d27,#16
	vzip.16	d26,d27
	subs	r8,r8,#8
	vst1.32	{d26[0]}, [r7, :32]!

	bne	.LNEON_tail

	vst1.32	{d10[0]}, [r7, :32]	@ top-most bit
	sub	r3,r3,r5,lsl#2		@ rewind r3
	subs	r1,sp,#0		@ clear carry flag (sets C=1, no borrow)
	add	r2,sp,r5,lsl#2		@ end of the num result words

	@ rp = tp - np, four words at a time.
.LNEON_sub:
	ldmia	r1!, {r4-r7}
	ldmia	r3!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	r1,r2			@ preserves carry
	stmia	r0!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [r1]		@ load top-most bit
	veor	q0,q0,q0
	sub	r11,r2,sp		@ this is num*4
	veor	q1,q1,q1
	mov	r1,sp
	sub	r0,r0,r11		@ rewind r0
	mov	r3,r2			@ second 3/4th of frame
	sbcs	r10,r10,#0		@ result is carry flag

	@ With carry clear (the subtraction borrowed) the tp words replace the
	@ difference already stored in rp; either way the scratch frame is
	@ overwritten with zeros.
.LNEON_copy_n_zap:
	ldmia	r1!, {r4-r7}
	ldmia	r0, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [r3,:256]!	@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [r3,:256]!	@ wipe
	movcc	r11,r7
	ldmia	r1, {r4-r7}
	stmia	r0!, {r8-r11}
	sub	r1,r1,#16
	ldmia	r0, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [r1,:256]!	@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [r3,:256]!	@ wipe
	movcc	r11,r7
	teq	r1,r2			@ preserves carry
	stmia	r0!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96		@ 8 core + 8 double registers were saved
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	RET				@ .word	0xe12fff1e
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif