dnl  ARM64 mpn_addmul_1 and mpn_submul_1

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013, 2015, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	9.3-9.8
C Cortex-A57	 7.0
C X-Gene	 5.0

C NOTES
C  * It is possible to keep the carry chain alive between the addition blocks
C    and thus avoid csinc, but only for addmul_1.  Since that saves no time
C    on the tested pipelines, we keep addmul_1 and submul_1 similar.
C  * We could separate feed-in into 4 blocks, one for each residue (mod 4).
C    That is likely to save a few cycles.
C The default m4 comment leader is the hash sign, and m4 does not expand
C macros inside its comments.  Move the leader out of the way so that a label
C macro following an immediate operand on the same line is still expanded.
changecom(blah)

C Arguments, in the first four integer argument registers.
define(`rp', `x0')	C limbs that are accumulated into; advanced as we go
define(`up', `x1')	C limbs that are multiplied; advanced as we go
define(`n',  `x2')	C limb count, later the count of four-limb groups
define(`v0', `x3')	C the single multiplier limb
C Scratch register holding the limb carried from one step into the next.
define(`cy', `x15')

C Select the accumulate instructions for the routine being built.  COND names
C the state of the carry flag after ADDSUB/ADDSUBC in which the carry limb is
C left alone by csinc: carry clear after adds/adcs, carry set (meaning no
C borrow) after subs/sbcs.  In the opposite state csinc adds one to it.
ifdef(`OPERATION_addmul_1', `
  define(`ADDSUB',	adds)
  define(`ADDSUBC',	adcs)
  define(`COND',	`cc')
  define(`func',	mpn_addmul_1)')
ifdef(`OPERATION_submul_1', `
  define(`ADDSUB',	subs)
  define(`ADDSUBC',	sbcs)
  define(`COND',	`cs')
  define(`func',	mpn_submul_1)')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C func (rp, up, n, v0)
C
C Multiply the n limbs at up by v0 and add the products to (addmul_1) or
C subtract them from (submul_1) the n limbs at rp, writing the result back to
C rp.  The limb carried or borrowed out of the top is returned in x0.
C
C n is consumed from its low bits: bit 0 selects a one-limb step, bit 1 a
C two-limb step, and the remaining n/4 groups of four limbs are handled by the
C main loop.  Registers x4-x17 are used as scratch; the code below makes no
C sp-relative accesses.

PROLOGUE(func)
	adds	cy, xzr, xzr		C cy = 0, carry flag cleared

	tbz	n, #0, L(two)		C bit 0 of n clear: skip the single limb

C One limb.
	ldr	x4, [up],#8
	mul	x8, x4, v0		C low half of the product
	umulh	x12, x4, v0		C high half of the product
	ldr	x4, [rp]
	ADDSUB	x8, x4, x8
	csinc	cy, x12, x12, COND	C cy = high half, +1 on carry/borrow out
	str	x8, [rp],#8

L(two):	tbz	n, #1, L(four)		C bit 1 of n clear: skip the limb pair

C Two limbs.
	ldp	x4, x5, [up],#16
	mul	x8, x4, v0
	umulh	x12, x4, v0
	mul	x9, x5, v0
	umulh	x13, x5, v0
	adds	x8, x8, cy		C low half 0 + incoming carry limb
	adcs	x9, x9, x12		C low half 1 + high half 0
	ldp	x4, x5, [rp]
	adc	cy, x13, xzr		C high half 1 + carry = next carry limb
	ADDSUB	x8, x4, x8
	ADDSUBC	x9, x5, x9
	csinc	cy, cy, cy, COND	C +1 on carry/borrow out of the pair
	stp	x8, x9, [rp],#16

L(four):
	lsr	n, n, #2		C n = number of four-limb groups
	cbz	n, L(done)
	ldp	x4, x5, [up],#32	C load the first group of up limbs
	ldp	x6, x7, [up,#-16]
	b	L(enter)		C no earlier group to write back yet
L(done):
	mov	x0, cy			C fewer than four limbs were left
	ret

C Main loop, four limbs per pass.  The upper part (from the loop label) loads
C the next group of up limbs and then accumulates and stores the group whose
C products were formed on the previous pass.  The lower part (from the entry
C label) forms the products of the group just loaded and fetches the matching
C rp limbs.  The last group is accumulated and stored after the loop.
	ALIGN(16)
L(loop):
	ldp	x4, x5, [up],#32
	ldp	x6, x7, [up,#-16]
	ADDSUB	x8, x16, x8
	ADDSUBC	x9, x17, x9
	stp	x8, x9, [rp],#32
	ADDSUBC	x10, x12, x10
	ADDSUBC	x11, x13, x11
	stp	x10, x11, [rp,#-16]
	csinc	cy, cy, cy, COND	C +1 on carry/borrow out of the group
L(enter):
	sub	n, n, #1
	mul	x8, x4, v0
	umulh	x12, x4, v0
	mul	x9, x5, v0
	umulh	x13, x5, v0
	adds	x8, x8, cy		C low half 0 + incoming carry limb
	mul	x10, x6, v0
	umulh	x14, x6, v0
	adcs	x9, x9, x12		C low half 1 + high half 0
	mul	x11, x7, v0
	umulh	cy, x7, v0		C high half 3 starts the next carry limb
	adcs	x10, x10, x13		C low half 2 + high half 1
	ldp	x16, x17, [rp]		C rp limbs 0 and 1 of this group
	adcs	x11, x11, x14		C low half 3 + high half 2
	ldp	x12, x13, [rp,#16]	C rp limbs 2 and 3; x12, x13 are free by now
	adc	cy, cy, xzr		C high half 3 + carry
	cbnz	n, L(loop)

C Wind down: accumulate and store the last group, then return the carry limb.
	ADDSUB	x8, x16, x8
	ADDSUBC	x9, x17, x9
	ADDSUBC	x10, x12, x10
	ADDSUBC	x11, x13, x11
	stp	x8, x9, [rp]
	stp	x10, x11, [rp,#16]
	csinc	x0, cy, cy, COND	C return value, +1 on carry/borrow out
	ret
EPILOGUE()