dnl  ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2012-2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?
36 37define(`ap', x0) 38define(`n', x1) 39 40changecom(@&*$) 41 42C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) 43 44C TODO 45C * An alternative inner loop which could run at 0.722 c/l: 46C adds x8, x8, x2 47C adcs x9, x9, x3 48C ldp x2, x3, [ap, #-32] 49C adcs x10, x10, x4 50C adc x12, x12, xzr 51C adds x8, x8, x5 52C ldp x4, x5, [ap, #-16] 53C sub n, n, #6 54C adcs x9, x9, x6 55C adcs x10, x10, x7 56C ldp x6, x7, [ap], #48 57C adc x12, x12, xzr 58C tbz n, #63, L(top) 59 60ASM_START() 61 TEXT 62 ALIGN(32) 63PROLOGUE(mpn_mod_34lsub1) 64 subs n, n, #3 65 mov x8, #0 66 b.lt L(le2) C n <= 2 67 68 ldp x2, x3, [ap, #0] 69 ldr x4, [ap, #16] 70 add ap, ap, #24 71 subs n, n, #3 72 b.lt L(sum) C n <= 5 73 cmn x0, #0 C clear carry 74 75L(top): ldp x5, x6, [ap, #0] 76 ldr x7, [ap, #16] 77 add ap, ap, #24 78 sub n, n, #3 79 adcs x2, x2, x5 80 adcs x3, x3, x6 81 adcs x4, x4, x7 82 tbz n, #63, L(top) 83 84 adc x8, xzr, xzr C x8 <= 1 85 86L(sum): cmn n, #2 87 mov x5, #0 88 b.lo 1f 89 ldr x5, [ap], #8 901: mov x6, #0 91 b.ls 1f 92 ldr x6, [ap], #8 931: adds x2, x2, x5 94 adcs x3, x3, x6 95 adcs x4, x4, xzr 96 adc x8, x8, xzr C x8 <= 2 97 98L(sum2): 99 and x0, x2, #0xffffffffffff 100 add x0, x0, x2, lsr #48 101 add x0, x0, x8 102 103 lsl x8, x3, #16 104 and x1, x8, #0xffffffffffff 105 add x0, x0, x1 106 add x0, x0, x3, lsr #32 107 108 lsl x8, x4, #32 109 and x1, x8, #0xffffffffffff 110 add x0, x0, x1 111 add x0, x0, x4, lsr #16 112 ret 113 114L(le2): cmn n, #1 115 b.ne L(1) 116 ldp x2, x3, [ap] 117 mov x4, #0 118 b L(sum2) 119L(1): ldr x2, [ap] 120 and x0, x2, #0xffffffffffff 121 add x0, x0, x2, lsr #48 122 ret 123EPILOGUE() 124