1#include "arm_arch.h" 2#include "arm_asm.h" 3 4.text 5.code 32 6.type mul_1x1_ialu,%function 7.align 5 8mul_1x1_ialu: 9 mov r4,#0 10 bic r5,r1,#3<<30 @ a1=a&0x3fffffff 11 str r4,[sp,#0] @ tab[0]=0 12 add r6,r5,r5 @ a2=a1<<1 13 str r5,[sp,#4] @ tab[1]=a1 14 eor r7,r5,r6 @ a1^a2 15 str r6,[sp,#8] @ tab[2]=a2 16 mov r8,r5,lsl#2 @ a4=a1<<2 17 str r7,[sp,#12] @ tab[3]=a1^a2 18 eor r9,r5,r8 @ a1^a4 19 str r8,[sp,#16] @ tab[4]=a4 20 eor r4,r6,r8 @ a2^a4 21 str r9,[sp,#20] @ tab[5]=a1^a4 22 eor r7,r7,r8 @ a1^a2^a4 23 str r4,[sp,#24] @ tab[6]=a2^a4 24 and r8,r12,r0,lsl#2 25 str r7,[sp,#28] @ tab[7]=a1^a2^a4 26 27 and r9,r12,r0,lsr#1 28 ldr r5,[sp,r8] @ tab[b & 0x7] 29 and r8,r12,r0,lsr#4 30 ldr r7,[sp,r9] @ tab[b >> 3 & 0x7] 31 and r9,r12,r0,lsr#7 32 ldr r6,[sp,r8] @ tab[b >> 6 & 0x7] 33 eor r5,r5,r7,lsl#3 @ stall 34 mov r4,r7,lsr#29 35 ldr r7,[sp,r9] @ tab[b >> 9 & 0x7] 36 37 and r8,r12,r0,lsr#10 38 eor r5,r5,r6,lsl#6 39 eor r4,r4,r6,lsr#26 40 ldr r6,[sp,r8] @ tab[b >> 12 & 0x7] 41 42 and r9,r12,r0,lsr#13 43 eor r5,r5,r7,lsl#9 44 eor r4,r4,r7,lsr#23 45 ldr r7,[sp,r9] @ tab[b >> 15 & 0x7] 46 47 and r8,r12,r0,lsr#16 48 eor r5,r5,r6,lsl#12 49 eor r4,r4,r6,lsr#20 50 ldr r6,[sp,r8] @ tab[b >> 18 & 0x7] 51 52 and r9,r12,r0,lsr#19 53 eor r5,r5,r7,lsl#15 54 eor r4,r4,r7,lsr#17 55 ldr r7,[sp,r9] @ tab[b >> 21 & 0x7] 56 57 and r8,r12,r0,lsr#22 58 eor r5,r5,r6,lsl#18 59 eor r4,r4,r6,lsr#14 60 ldr r6,[sp,r8] @ tab[b >> 24 & 0x7] 61 62 and r9,r12,r0,lsr#25 63 eor r5,r5,r7,lsl#21 64 eor r4,r4,r7,lsr#11 65 ldr r7,[sp,r9] @ tab[b >> 27 & 0x7] 66 67 tst r1,#1<<30 68 and r8,r12,r0,lsr#28 69 eor r5,r5,r6,lsl#24 70 eor r4,r4,r6,lsr#8 71 ldr r6,[sp,r8] @ tab[b >> 30 ] 72 73 eorne r5,r5,r0,lsl#30 74 eorne r4,r4,r0,lsr#2 75 tst r1,#1<<31 76 eor r5,r5,r7,lsl#27 77 eor r4,r4,r7,lsr#5 78 eorne r5,r5,r0,lsl#31 79 eorne r4,r4,r0,lsr#1 80 eor r5,r5,r6,lsl#30 81 eor r4,r4,r6,lsr#2 82 83 mov pc,lr 84.size mul_1x1_ialu,.-mul_1x1_ialu 85.global bn_GF2m_mul_2x2 86.type bn_GF2m_mul_2x2,%function 87.align 5 88bn_GF2m_mul_2x2: 89#if __ARM_MAX_ARCH__>=7 90 ldr r12,.LOPENSSL_armcap 91.Lpic: ldr r12,[pc,r12] 92 tst r12,#1 93 bne .LNEON 94#endif 95 stmdb sp!,{r4-r10,lr} 96 mov r10,r0 @ reassign 1st argument 97 mov r0,r3 @ r0=b1 98 ldr r3,[sp,#32] @ load b0 99 mov r12,#7<<2 100 sub sp,sp,#32 @ allocate tab[8] 101 102 bl mul_1x1_ialu @ a1�b1 103 str r5,[r10,#8] 104 str r4,[r10,#12] 105 106 eor r0,r0,r3 @ flip b0 and b1 107 eor r1,r1,r2 @ flip a0 and a1 108 eor r3,r3,r0 109 eor r2,r2,r1 110 eor r0,r0,r3 111 eor r1,r1,r2 112 bl mul_1x1_ialu @ a0�b0 113 str r5,[r10] 114 str r4,[r10,#4] 115 116 eor r1,r1,r2 117 eor r0,r0,r3 118 bl mul_1x1_ialu @ (a1+a0)�(b1+b0) 119 ldmia r10,{r6-r9} 120 eor r5,r5,r4 121 eor r4,r4,r7 122 eor r5,r5,r6 123 eor r4,r4,r8 124 eor r5,r5,r9 125 eor r4,r4,r9 126 str r4,[r10,#8] 127 eor r5,r5,r4 128 add sp,sp,#32 @ destroy tab[8] 129 str r5,[r10,#4] 130 131#if __ARM_ARCH__>=5 132 ldmia sp!,{r4-r10,pc} 133#else 134 ldmia sp!,{r4-r10,lr} 135 tst lr,#1 136 moveq pc,lr @ be binary compatible with V4, yet 137 .word 0xe12fff1e @ interoperable with Thumb ISA:-) 138#endif 139#if __ARM_MAX_ARCH__>=7 140.arch armv7-a 141.fpu neon 142 143.align 5 144.LNEON: 145 ldr r12, [sp] @ 5th argument 146 vmov d26, r2, r1 147 vmov d27, r12, r3 148 vmov.i64 d28, #0x0000ffffffffffff 149 vmov.i64 d29, #0x00000000ffffffff 150 vmov.i64 d30, #0x000000000000ffff 151 152 vext.8 d2, d26, d26, #1 @ A1 153 vmull.p8 q1, d2, d27 @ F = A1*B 154 vext.8 d0, d27, d27, #1 @ B1 155 vmull.p8 q0, d26, d0 @ E = A*B1 156 vext.8 d4, d26, d26, #2 @ A2 157 vmull.p8 
q2, d4, d27 @ H = A2*B 158 vext.8 d16, d27, d27, #2 @ B2 159 vmull.p8 q8, d26, d16 @ G = A*B2 160 vext.8 d6, d26, d26, #3 @ A3 161 veor q1, q1, q0 @ L = E + F 162 vmull.p8 q3, d6, d27 @ J = A3*B 163 vext.8 d0, d27, d27, #3 @ B3 164 veor q2, q2, q8 @ M = G + H 165 vmull.p8 q0, d26, d0 @ I = A*B3 166 veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8 167 vand d3, d3, d28 168 vext.8 d16, d27, d27, #4 @ B4 169 veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16 170 vand d5, d5, d29 171 vmull.p8 q8, d26, d16 @ K = A*B4 172 veor q3, q3, q0 @ N = I + J 173 veor d2, d2, d3 174 veor d4, d4, d5 175 veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24 176 vand d7, d7, d30 177 vext.8 q1, q1, q1, #15 178 veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32 179 vmov.i64 d17, #0 180 vext.8 q2, q2, q2, #14 181 veor d6, d6, d7 182 vmull.p8 q0, d26, d27 @ D = A*B 183 vext.8 q8, q8, q8, #12 184 vext.8 q3, q3, q3, #13 185 veor q1, q1, q2 186 veor q3, q3, q8 187 veor q0, q0, q1 188 veor q0, q0, q3 189 190 vst1.32 {q0}, [r0] 191 RET @ bx lr 192#endif 193.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 194#if __ARM_MAX_ARCH__>=7 195.align 5 196.LOPENSSL_armcap: 197.word OPENSSL_armcap_P-(.Lpic+8) 198#endif 199.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>" 200.align 5 201 202#if __ARM_MAX_ARCH__>=7 203.comm OPENSSL_armcap_P,4,4 204#endif 205
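/*
 * Reference model: a minimal C sketch of what the code above computes,
 * kept inside a comment so the file still assembles.  It is an
 * illustration, not part of the CRYPTOGAMS output; the names mul_1x1_ref
 * and bn_GF2m_mul_2x2_ref are hypothetical.  mul_1x1_ialu builds tab[],
 * the eight GF(2)[x] multiples of a&0x3fffffff by 0..7, consumes b three
 * bits per lookup (eleven lookups), then patches in bits 30 and 31 of a
 * with the conditional eorne shifts of b.  bn_GF2m_mul_2x2 is one
 * Karatsuba step over three such 32x32-bit carry-less multiplications;
 * the NEON path at .LNEON instead forms the 64x64-bit product directly
 * from the vmull.p8 partial products D..K annotated above.
 *
 *   #include <stdint.h>
 *
 *   // 32x32 -> 64-bit carry-less multiply, modeling mul_1x1_ialu
 *   static void mul_1x1_ref(uint32_t *hi, uint32_t *lo,
 *                           uint32_t a, uint32_t b)
 *   {
 *       uint64_t tab[8], r = 0;
 *       uint32_t a1 = a & 0x3fffffff;         // bic r5,r1,#3<<30
 *       tab[0] = 0;                 tab[1] = a1;
 *       tab[2] = (uint64_t)a1 << 1; tab[3] = tab[1] ^ tab[2];
 *       tab[4] = (uint64_t)a1 << 2; tab[5] = tab[1] ^ tab[4];
 *       tab[6] = tab[2] ^ tab[4];   tab[7] = tab[3] ^ tab[4];
 *       for (int i = 0; i < 32; i += 3)       // tab[b >> i & 0x7]
 *           r ^= tab[(b >> i) & 7] << i;
 *       if (a & (1U << 30)) r ^= (uint64_t)b << 30;  // tst r1,#1<<30
 *       if (a & (1U << 31)) r ^= (uint64_t)b << 31;  // tst r1,#1<<31
 *       *lo = (uint32_t)r;  *hi = (uint32_t)(r >> 32);
 *   }
 *
 *   // Karatsuba: a1·b1, a0·b0 and (a1^a0)·(b1^b0), recombined exactly as
 *   // the eor sequence after the third bl mul_1x1_ialu
 *   static void bn_GF2m_mul_2x2_ref(uint32_t r[4], uint32_t a1, uint32_t a0,
 *                                   uint32_t b1, uint32_t b0)
 *   {
 *       uint32_t hh, hl, lh, ll, mh, ml;
 *       mul_1x1_ref(&hh, &hl, a1, b1);            // a1·b1 -> r[3]:r[2]
 *       mul_1x1_ref(&lh, &ll, a0, b0);            // a0·b0 -> r[1]:r[0]
 *       mul_1x1_ref(&mh, &ml, a1 ^ a0, b1 ^ b0);  // (a1+a0)·(b1+b0)
 *       mh ^= hh ^ lh;  ml ^= hl ^ ll;            // middle 64-bit term
 *       r[0] = ll;       r[1] = lh ^ ml;
 *       r[2] = hl ^ mh;  r[3] = hh;
 *   }
 */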