#include "arm_arch.h"
#include "arm_asm.h"

.syntax	unified
.text
.code	32

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf	@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf	@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrbpl	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
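	@ The instructions above folded the low nibble (nlo) of the
	@ current byte into the accumulator r4-r7; the second half of
	@ .Linner below folds in the high nibble (nhi).  This is the
	@ classic 4-bit table-driven GHASH: Xi*H is built four bits at
	@ a time, with rem_4bit supplying pre-reduced constants for the
	@ bits shifted out of the bottom of the 128-bit accumulator.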

	add	r11,r1,r14
	and	r14,r4,#0xf	@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrbpl	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]	@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrbne	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	.word	0xe12fff1e	@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf	@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf	@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrbpl	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf	@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
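	@ As in .Linner above, the "pl"-conditional masks below (and the
	@ ldrbpl earlier) are predicated on the N flag from "subs r3,r3,#1",
	@ so the next byte of Xi is fetched and split into nibbles only
	@ while r3 >= 0; the final iteration skips the fetch and falls
	@ through to the store-out sequence.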
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	.word	0xe12fff1e	@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
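@ gcm_init_neon precomputes the "twisted" H that the NEON multiply
@ below consumes: H is shifted left by one bit (with cross-lane carry,
@ "H<<<=1") and, when the bit shifted out of the top is set, xored
@ with the reduction constant 0xc2...01 built in q8.  Judging from the
@ loads and the store, r0 receives the twisted H and r1 points at the
@ raw 128-bit hash key H.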
gcm_init_neon:
	vld1.64	d7,[r1,:64]!	@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1,:64]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26			@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	RET				@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0,:64]!	@ load Xi
	vld1.64	d6,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0,:64]!	@ load Xi
	vld1.64	d0,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!	@ load inp
	vld1.64	d6,[r2]!
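	@ GHASH is defined on big-endian 128-bit values, so on
	@ little-endian builds the block just loaded is byte-reversed
	@ (vrev64.8) before being xored into Xi.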
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d0, d6, d6, #1		@ B1
	vmull.p8	q0, d26, d0	@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d26, d22	@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d0, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22	@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7		@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d2, d6, d6, #1		@ B1
	vmull.p8	q1, d28, d2	@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d28, d22	@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d2, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22	@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
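	@ This repeats the vmull.p8 schedule above for the Karatsuba
	@ middle limb: d28 = d26^d27 (the "Karatsuba pre-processing" done
	@ at entry) times d6 = d6^d7.  In each limb the veor/vand pairs
	@ mask the partial products with d29-d31 to drop bits that would
	@ wrap past the 64-bit lane, then vext.8 shifts them into place,
	@ emulating a 64x64-bit carry-less multiply without vmull.p64.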
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7	@ F = A1*B
	vext.8	d4, d7, d7, #1		@ B1
	vmull.p8	q2, d27, d4	@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7	@ H = A2*B
	vext.8	d22, d7, d7, #2		@ B2
	vmull.p8	q11, d27, d22	@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7	@ J = A3*B
	vext.8	d4, d7, d7, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22	@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
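	@ Together the two phases reduce the 256-bit product modulo the
	@ GHASH polynomial x^128 + x^7 + x^2 + x + 1: the left shifts by
	@ 57/62/63 (= 64-7, 64-2, 64-1) above and the right shifts
	@ accumulating to 1, 2 and 7 below correspond to its x^7, x^2
	@ and x terms.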
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0,:64]!	@ write out Xi
	vst1.64	d0,[r0,:64]

	RET				@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz	"GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2