/*	$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

RCSID("$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $")

	.fpu	neon

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .

	.section .rodata
	.p2align 5
.Lconstants:

.Linv_inva:	/* inv and inva must be consecutive */
	.type	inv,_ASM_TYPE_OBJECT
inv:
	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type	inva,_ASM_TYPE_OBJECT
inva:
	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)

	.type	mc,_ASM_TYPE_OBJECT
mc:
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 forward */
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 backward */
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 forward */
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 backward */
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 forward */
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 backward */
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
.Lmc_forward_3:
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 forward */
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 backward */
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc)

	.type	sr,_ASM_TYPE_OBJECT
sr:
	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)

	.type	ipt,_ASM_TYPE_OBJECT
ipt:
	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2	/* lo */
	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C	/* hi */
	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipt)

	.type	sb1,_ASM_TYPE_OBJECT
sb1:
	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1	/* 0 */
	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36	/* 1 */
	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1)

	.type	sb2,_ASM_TYPE_OBJECT
sb2:
	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2	/* 0 */
	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69	/* 1 */
	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2)

	.type	sbo,_ASM_TYPE_OBJECT
sbo:
	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0	/* 0 */
	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF	/* 1 */
	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo)

	.type	dipt,_ASM_TYPE_OBJECT
dipt:
	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F	/* lo */
	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86	/* hi */
	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipt)

	.type	dsb9,_ASM_TYPE_OBJECT
dsb9:
	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85	/* 0 */
	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0	/* 1 */
	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9)

	.type	dsbd,_ASM_TYPE_OBJECT
dsbd:
	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D	/* 0 */
	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C	/* 1 */
	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd)

	.type	dsbb,_ASM_TYPE_OBJECT
dsbb:
	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0	/* 0 */
	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1	/* 1 */
	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb)

	.type	dsbe,_ASM_TYPE_OBJECT
dsbe:
	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46	/* 0 */
	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C	/* 1 */
	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe)

	.type	dsbo,_ASM_TYPE_OBJECT
dsbo:
	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13	/* 0 */
	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12	/* 1 */
	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_enc1(const struct aesenc *enc@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
ENTRY(aes_neon_enc1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r8, r10, lr}
	vpush	{d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc
	 * r6,r8,r10,ip: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc[rmod4].forward
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc[rmod4].backward
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */

	/* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */
	ldr	ip, .Lconstants_addr
	adr	r10, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	movw	r3, #0
	vmov.i8	q1, #0x0f

	/* ip := .Lconstants */
	add	ip, ip, r10

	/* (q4, q5) := (iptlo, ipthi) */
	add	r6, ip, #(ipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, ip, #(sb1 - .Lconstants)
	add	r6, ip, #(sb2 - .Lconstants)
	add	r8, ip, #(.Linv_inva - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 = sb1[0], q7 = sb1[1] */
	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */

	/* r4 := mc */
	add	r4, ip, #(mc - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8	d4, {q4}, d4
	vtbl.8	d5, {q4}, d5
	vtbl.8	d6, {q5}, d6
	vtbl.8	d7, {q5}, d7

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8	d24, {q6}, d4
	vtbl.8	d25, {q6}, d5
	vtbl.8	d26, {q7}, d6
	vtbl.8	d27, {q7}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := A2 = sb2_0[io] + sb2_1[jo] */
	vtbl.8	d24, {q8}, d4
	vtbl.8	d25, {q8}, d5
	vtbl.8	d26, {q9}, d6
	vtbl.8	d27, {q9}, d7
	add	r6, r4, r3, lsl #5	/* r6 := &mc[rmod4] */
	veor	q14, q12, q13

	/* (q12, q13) := (mc[rmod4].forward, mc[rmod4].backward) */
	vld1.8	{q12-q13}, [r6 :256]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8	d30, {q0}, d24
	vtbl.8	d31, {q0}, d25
	veor	q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8	d28, {q0}, d26
	vtbl.8	d29, {q0}, d27
	veor	q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8	d0, {q15}, d24
	vtbl.8	d1, {q15}, d25
	veor	q0, q0, q14

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {q11}, d4
	vtbl.8	d1, {q11}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {q10}, d6
	vtbl.8	d25, {q10}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {q10}, d4
	vtbl.8	d27, {q10}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {q10}, d24
	vtbl.8	d25, {q10}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {q10}, d26
	vtbl.8	d27, {q10}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	add	r3, r3, #1
	subs	r1, r1, #1
	and	r3, r3, #3
	bne	1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add	r8, ip, #(sr - .Lconstants)
	add	r6, ip, #(sbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8	d4, {q6}, d4
	vtbl.8	d5, {q6}, d5
	vtbl.8	d6, {q7}, d6
	vtbl.8	d7, {q7}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8	d0, {q2}, d30
	vtbl.8	d1, {q2}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r8, r10, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_enc1)
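
/*
 * Illustrative only: a minimal C-side sketch of how aes_neon_enc1 above is
 * reached under the two ABIs described in its header comment.  The
 * prototype is copied from that comment; the header that really declares
 * it and the definition of struct aesenc are not shown here and are left
 * as assumptions.  With -mfloat-abi=hard the vector argument and result
 * travel in q0; under __SOFTFP__ they are split across r2,r3,sp[0],sp[4]
 * and returned in r0-r3, as the prologue/epilogue above implement.
 *
 *	#include <arm_neon.h>
 *
 *	struct aesenc;			// expanded AES key schedule (opaque here)
 *	uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
 *
 *	uint8x16_t
 *	encrypt_block(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
 *	{
 *		// nrounds is 10, 12, or 14 for AES-128/192/256
 *		return aes_neon_enc1(enc, x, nrounds);
 *	}
 */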

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_dec1(const struct aesdec *dec@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
ENTRY(aes_neon_dec1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r8, r10, lr}
	vpush	{d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * r4: dsbd
	 * r5: dsbe
	 * r6,r8,r10,ip: temporaries
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */

	/* ip := .Lconstants - .Lconstants_addr, r10 := .Lconstants_addr */
	ldr	ip, .Lconstants_addr
	adr	r10, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	rsb	r3, r1, #0		/* r3 := -nrounds = ~(nrounds - 1) */
	vmov.i8	q1, #0x0f
	and	r3, r3, #3		/* r3 := 3 & ~(nrounds - 1) */

	/* ip := .Lconstants */
	add	ip, ip, r10

	/* (q4, q5) := (diptlo, dipthi) */
	add	r6, ip, #(dipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, ip, #(dsbb - .Lconstants)
	add	r6, ip, #(.Linv_inva - .Lconstants)
	add	r8, ip, #(.Lmc_forward_3 - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 := dsbb[0], q7 := dsbb[1] */
	vld1.8	{q10-q11}, [r6 :256]	/* q10 := inv, q11 := inva */
	vld1.8	{q15}, [r8 :128]	/* q15 := mc[3].forward */

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8	d4, {q4}, d4
	vtbl.8	d5, {q4}, d5
	vtbl.8	d6, {q5}, d6
	vtbl.8	d7, {q5}, d7

	/* load dsb9 */
	add	r4, ip, #(dsb9 - .Lconstants)
	vld1.8	{q4-q5}, [r4 :256]	/* q4 := dsb9[0], q5 := dsb9[1] */

	/* r4 := dsbd, r5 := dsbe */
	add	r4, ip, #(dsbd - .Lconstants)
	add	r5, ip, #(dsbe - .Lconstants)

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	/* load dsbd */
	vld1.8	{q8-q9}, [r4 :256]	/* q8 := dsbd[0], q9 := dsbd[1] */

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8	d24, {q4}, d4
	vtbl.8	d25, {q4}, d5
	vtbl.8	d26, {q5}, d6
	vtbl.8	d27, {q5}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8	d28, {q0}, d30
	vtbl.8	d29, {q0}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8	d24, {q8}, d4
	vtbl.8	d25, {q8}, d5
	vtbl.8	d26, {q9}, d6
	vtbl.8	d27, {q9}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* load dsbe */
	vld1.8	{q8-q9}, [r5 :256]	/* q8 := dsbe[0], q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8	d28, {q0}, d30
	vtbl.8	d29, {q0}, d31
	vtbl.8	d24, {q6}, d4
	vtbl.8	d25, {q6}, d5
	vtbl.8	d26, {q7}, d6
	vtbl.8	d27, {q7}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8	d28, {q0}, d30
	vtbl.8	d29, {q0}, d31
	vtbl.8	d24, {q8}, d4
	vtbl.8	d25, {q8}, d5
	vtbl.8	d26, {q9}, d6
	vtbl.8	d27, {q9}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8	q15, q15, q15, #12

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {q11}, d4
	vtbl.8	d1, {q11}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {q10}, d6
	vtbl.8	d25, {q10}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {q10}, d4
	vtbl.8	d27, {q10}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {q10}, d24
	vtbl.8	d25, {q10}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {q10}, d26
	vtbl.8	d27, {q10}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	subs	r1, r1, #1
	bne	1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
	add	r8, ip, #(sr - .Lconstants)
	add	r6, ip, #(dsbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8	d4, {q6}, d4
	vtbl.8	d5, {q6}, d5
	vtbl.8	d6, {q7}, d6
	vtbl.8	d7, {q7}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[i]) */
	vtbl.8	d0, {q2}, d30
	vtbl.8	d1, {q2}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r8, r10, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_dec1)
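
/*
 * Illustrative only: a matching C-side sketch for aes_neon_dec1, under the
 * same ABI conventions as aes_neon_enc1 above.  The prototype is taken from
 * the comment preceding ENTRY(aes_neon_dec1); the declaring header and the
 * layout of struct aesdec are assumptions and are left opaque here.
 *
 *	#include <arm_neon.h>
 *
 *	struct aesdec;			// expanded AES decryption key schedule
 *	uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
 *
 *	uint8x16_t
 *	decrypt_block(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
 *	{
 *		// nrounds is 10, 12, or 14 for AES-128/192/256
 *		return aes_neon_dec1(dec, x, nrounds);
 *	}
 */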