@ GF(2^m) multiplication primitives for 32-bit ARM (GNU as syntax, run
@ through the C preprocessor).
@
@   mul_1x1_ialu      file-local 32x32 -> 64-bit carry-less multiply that
@                     uses only the integer ALU
@   bn_GF2m_mul_2x2   exported 64x64 -> 128-bit carry-less multiply with
@                     an integer path and, when built for ARMv7 or later,
@                     a NEON path selected at run time
#include "arm_arch.h"
#include "arm_asm.h"

.text
.code   32

@-----------------------------------------------------------------------
@ mul_1x1_ialu: carry-less (polynomial over GF(2)) product of two 32-bit
@ words, consuming b three bits at a time through an 8-entry table.
@ No .global is emitted for it and it does not follow the public calling
@ convention; the register contract below is private to this file.
@
@ In:    r0  = b
@        r1  = a
@        r12 = 7<<2, mask that turns a shifted copy of b into a byte
@              offset into the table
@        sp  = base of a 32-byte scratch area used as tab[8]
@        lr  = return address
@ Out:   r5  = low  32 bits of a*b
@        r4  = high 32 bits of a*b
@ Clobb: r4-r9, condition flags, the 32 bytes at sp
@ Keeps: r0-r3, r10, r12, sp
@
@ The table is built from a1 = a & 0x3fffffff so that every entry, up to
@ a1<<2, still fits in 32 bits.  Bits 30 and 31 of a are folded in at the
@ end by conditionally xoring in b<<30 and b<<31.
@-----------------------------------------------------------------------
.type   mul_1x1_ialu,%function
.align  5
mul_1x1_ialu:
        @ Build tab[i] = i*a1 (carry-less) for i = 0..7; the stores are
        @ interleaved with the computations.
        mov     r4,#0
        bic     r5,r1,#3<<30            @ a1=a&0x3fffffff
        str     r4,[sp,#0]              @ tab[0]=0
        add     r6,r5,r5                @ a2=a1<<1
        str     r5,[sp,#4]              @ tab[1]=a1
        eor     r7,r5,r6                @ a1^a2
        str     r6,[sp,#8]              @ tab[2]=a2
        mov     r8,r5,lsl#2             @ a4=a1<<2
        str     r7,[sp,#12]             @ tab[3]=a1^a2
        eor     r9,r5,r8                @ a1^a4
        str     r8,[sp,#16]             @ tab[4]=a4
        eor     r4,r6,r8                @ a2^a4
        str     r9,[sp,#20]             @ tab[5]=a1^a4
        eor     r7,r7,r8                @ a1^a2^a4
        str     r4,[sp,#24]             @ tab[6]=a2^a4
        and     r8,r12,r0,lsl#2         @ byte offset of tab[b & 0x7]
        str     r7,[sp,#28]             @ tab[7]=a1^a2^a4

        @ Walk b in 3-bit windows.  Each shift amount is 2 less than the
        @ window position because the masked value is a byte offset
        @ (index*4).  r8/r9 and r6/r7 alternate so that a table load can
        @ be issued while the previous entry is being accumulated.
        and     r9,r12,r0,lsr#1
        ldr     r5,[sp,r8]              @ tab[b       & 0x7]
        and     r8,r12,r0,lsr#4
        ldr     r7,[sp,r9]              @ tab[b >>  3 & 0x7]
        and     r9,r12,r0,lsr#7
        ldr     r6,[sp,r8]              @ tab[b >>  6 & 0x7]
        eor     r5,r5,r7,lsl#3          @ stall
        mov     r4,r7,lsr#29            @ first contribution to the high word
        ldr     r7,[sp,r9]              @ tab[b >>  9 & 0x7]

        and     r8,r12,r0,lsr#10
        eor     r5,r5,r6,lsl#6
        eor     r4,r4,r6,lsr#26
        ldr     r6,[sp,r8]              @ tab[b >> 12 & 0x7]

        and     r9,r12,r0,lsr#13
        eor     r5,r5,r7,lsl#9
        eor     r4,r4,r7,lsr#23
        ldr     r7,[sp,r9]              @ tab[b >> 15 & 0x7]

        and     r8,r12,r0,lsr#16
        eor     r5,r5,r6,lsl#12
        eor     r4,r4,r6,lsr#20
        ldr     r6,[sp,r8]              @ tab[b >> 18 & 0x7]

        and     r9,r12,r0,lsr#19
        eor     r5,r5,r7,lsl#15
        eor     r4,r4,r7,lsr#17
        ldr     r7,[sp,r9]              @ tab[b >> 21 & 0x7]

        and     r8,r12,r0,lsr#22
        eor     r5,r5,r6,lsl#18
        eor     r4,r4,r6,lsr#14
        ldr     r6,[sp,r8]              @ tab[b >> 24 & 0x7]

        and     r9,r12,r0,lsr#25
        eor     r5,r5,r7,lsl#21
        eor     r4,r4,r7,lsr#11
        ldr     r7,[sp,r9]              @ tab[b >> 27 & 0x7]

        @ The flags set here are consumed by the first eorne pair below;
        @ none of the instructions in between updates the flags.
        tst     r1,#1<<30
        and     r8,r12,r0,lsr#28
        eor     r5,r5,r6,lsl#24
        eor     r4,r4,r6,lsr#8
        ldr     r6,[sp,r8]              @ tab[b >> 30      ]

        eorne   r5,r5,r0,lsl#30         @ bit 30 of a set: add b<<30 (low)
        eorne   r4,r4,r0,lsr#2          @                             (high)
        tst     r1,#1<<31               @ flags for the second eorne pair
        eor     r5,r5,r7,lsl#27
        eor     r4,r4,r7,lsr#5
        eorne   r5,r5,r0,lsl#31         @ bit 31 of a set: add b<<31 (low)
        eorne   r4,r4,r0,lsr#1          @                             (high)
        eor     r5,r5,r6,lsl#30
        eor     r4,r4,r6,lsr#2

        mov     pc,lr
.size   mul_1x1_ialu,.-mul_1x1_ialu

@-----------------------------------------------------------------------
@ bn_GF2m_mul_2x2: 64x64 -> 128-bit carry-less multiplication.
@ Computes (a1:a0) * (b1:b0) as polynomials over GF(2) and stores the
@ four 32-bit result words, least significant first, at r[0..3].
@ The C prototype is not visible in this file; the register roles below
@ are taken from the code.
@
@ In:    r0   = r, pointer to four writable 32-bit words
@        r1   = a1 (high word of a)
@        r2   = a0 (low word of a)
@        r3   = b1 (high word of b)
@        [sp] = b0 (low word of b, fifth argument)
@ Out:   r[0..3] written; no register result
@ Stack: integer path pushes r4-r10,lr (32 bytes) and reserves another
@        32 bytes for the lookup table used by mul_1x1_ialu
@ Clobb: integer path: r0-r3, r12, flags
@        NEON path:    r12, flags, q0-q3, q8, d26-d30
@
@ When assembled with __ARM_MAX_ARCH__>=7, bit 0 of OPENSSL_armcap_P is
@ tested at run time and, if set, the NEON implementation at .LNEON is
@ used instead of the integer one.
@ NOTE(review): bit 0 is presumably the NEON capability flag declared in
@ arm_arch.h - confirm against that header.
@-----------------------------------------------------------------------
.global bn_GF2m_mul_2x2
.type   bn_GF2m_mul_2x2,%function
.align  5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
        @ Position-independent load of OPENSSL_armcap_P: the literal holds
        @ the distance from .Lpic+8 to the variable, and in ARM state pc
        @ reads as the address of the current instruction plus 8.
        ldr     r12,.LOPENSSL_armcap
.Lpic:  ldr     r12,[pc,r12]
        tst     r12,#1
        bne     .LNEON
#endif
        stmdb   sp!,{r4-r10,lr}
        mov     r10,r0                  @ reassign 1st argument
        mov     r0,r3                   @ r0=b1
        ldr     r3,[sp,#32]             @ load b0 (above the 8 saved words)
        mov     r12,#7<<2               @ table offset mask for mul_1x1_ialu
        sub     sp,sp,#32               @ allocate tab[8]

        @ Karatsuba: three 32x32 products instead of four.
        bl      mul_1x1_ialu            @ a1*b1
        str     r5,[r10,#8]
        str     r4,[r10,#12]

        @ Swap r0<->r3 and r1<->r2 with three xors each (no scratch
        @ register needed); the two swaps are interleaved.
        eor     r0,r0,r3                @ flip b0 and b1
        eor     r1,r1,r2                @ flip a0 and a1
        eor     r3,r3,r0
        eor     r2,r2,r1
        eor     r0,r0,r3
        eor     r1,r1,r2
        bl      mul_1x1_ialu            @ a0*b0
        str     r5,[r10]
        str     r4,[r10,#4]

        eor     r1,r1,r2                @ r1 = a0^a1
        eor     r0,r0,r3                @ r0 = b0^b1
        bl      mul_1x1_ialu            @ (a1+a0)*(b1+b0)

        @ Fold the middle product m = r4:r5 into words 1 and 2:
        @   r[1] = m.lo ^ r[0] ^ r[1] ^ r[2]
        @   r[2] = m.hi ^ r[1] ^ r[2] ^ r[3]
        @ (right-hand sides use the values loaded by the ldmia).
        ldmia   r10,{r6-r9}             @ r6..r9 = r[0..3] so far
        eor     r5,r5,r4
        eor     r4,r4,r7
        eor     r5,r5,r6
        eor     r4,r4,r8
        eor     r5,r5,r9
        eor     r4,r4,r9                @ r4 = new r[2]
        str     r4,[r10,#8]
        eor     r5,r5,r4                @ r5 = new r[1]
        add     sp,sp,#32               @ destroy tab[8]
        str     r5,[r10,#4]

#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r10,pc}
#else
        ldmia   sp!,{r4-r10,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e              @ interoperable with Thumb ISA:-)
                                        @ (the word above encodes bx lr)
#endif
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

        @ NEON path.  A = d26 = a1:a0 and B = d27 = b1:b0.  The 64x64
        @ product is assembled from 8x8-bit polynomial multiplies
        @ (vmull.p8) of A and B against byte-rotated copies of each
        @ other.  Nothing is pushed here, so the fifth argument is still
        @ at [sp].
.align  5
.LNEON:
        ldr             r12, [sp]               @ 5th argument
        vmov            d26, r2, r1
        vmov            d27, r12, r3
        vmov.i64        d28, #0x0000ffffffffffff        @ keep low 48 bits
        vmov.i64        d29, #0x00000000ffffffff        @ keep low 32 bits
        vmov.i64        d30, #0x000000000000ffff        @ keep low 16 bits

        vext.8          d2, d26, d26, #1        @ A1
        vmull.p8        q1, d2, d27             @ F = A1*B
        vext.8          d0, d27, d27, #1        @ B1
        vmull.p8        q0, d26, d0             @ E = A*B1
        vext.8          d4, d26, d26, #2        @ A2
        vmull.p8        q2, d4, d27             @ H = A2*B
        vext.8          d16, d27, d27, #2       @ B2
        vmull.p8        q8, d26, d16            @ G = A*B2
        vext.8          d6, d26, d26, #3        @ A3
        veor            q1, q1, q0              @ L = E + F
        vmull.p8        q3, d6, d27             @ J = A3*B
        vext.8          d0, d27, d27, #3        @ B3
        veor            q2, q2, q8              @ M = G + H
        vmull.p8        q0, d26, d0             @ I = A*B3
        veor            d2, d2, d3              @ t0 = (L) (P0 + P1) << 8
        vand            d3, d3, d28
        vext.8          d16, d27, d27, #4       @ B4
        veor            d4, d4, d5              @ t1 = (M) (P2 + P3) << 16
        vand            d5, d5, d29
        vmull.p8        q8, d26, d16            @ K = A*B4
        veor            q3, q3, q0              @ N = I + J
        veor            d2, d2, d3
        veor            d4, d4, d5
        veor            d6, d6, d7              @ t2 = (N) (P4 + P5) << 24
        vand            d7, d7, d30
        vext.8          q1, q1, q1, #15         @ rotate t0 up by 1 byte
        veor            d16, d16, d17           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        d17, #0
        vext.8          q2, q2, q2, #14         @ rotate t1 up by 2 bytes
        veor            d6, d6, d7
        vmull.p8        q0, d26, d27            @ D = A*B
        vext.8          q8, q8, q8, #12         @ rotate t3 up by 4 bytes
        vext.8          q3, q3, q3, #13         @ rotate t2 up by 3 bytes
        veor            q1, q1, q2
        veor            q3, q3, q8
        veor            q0, q0, q1
        veor            q0, q0, q3

        vst1.32         {q0}, [r0]              @ store all four result words
        RET                                     @ bx lr
#endif
.size   bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
        @ pc-relative offset consumed by the load at .Lpic
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz  "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  5

#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif