1dnl ARM mpn_addmul_3. 2 3dnl Contributed to the GNU project by Torbjörn Granlund. 4 5dnl Copyright 2013 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35C cycles/limb 36C StrongARM: - 37C XScale - 38C ARM11 4.33 39C Cortex-A5 3.28 40C Cortex-A7 3.25 41C Cortex-A8 3.17 42C Cortex-A9 2.125 43C Cortex-A15 2 44C Cortex-A17 2.11 45C Cortex-A53 4.18 46 47C TODO 48C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table, 49C avoiding the current multiply. 50C * Start the first multiply or multiplies early. 

C mpn_addmul_3 -- multiply the n limbs at up by the three limbs at vp and
C add the product to the limbs at rp.
C
C Arguments as named by the defines below:
C   rp  r0  destination, read and written
C   up  r1  multiplicand limbs
C   n   r2  limb count of up
C   vp  r3  three multiplier limbs, loaded once into v0, v1, v2
C
C The code reads n limbs from rp, writes n+2 limbs to rp (the wind-down at
C the end stores two words that were never loaded from rp) and returns the
C next more significant limb in r0.
C
C Working registers:
C   v0-v2    the three multiplier limbs
C   u0, u1   alternating multiplicand limbs; u0 reuses r3, which is free
C            once vp has been consumed by the ldm
C   w0-w2    rotating window of three result words
C   cy0-cy2  one running carry limb per multiplier limb
C
C Each umaal computes hi:lo = a * b + lo + hi, so one instruction both adds
C a partial product into a window word and carries on into the matching cy.
C
C NOTE(review): every path runs the two-limb wind-down at end, and the
C entries used for n mod 6 = 0, 1, 2 first run 4, 5, 6 loop stages, so
C n = 0, 1, 2 would be processed as 6, 7, 8 limbs.  The code appears to
C assume n >= 3 -- TODO confirm against callers.

define(`rp',`r0')
define(`up',`r1')
define(`n', `r2')
define(`vp',`r3')

define(`v0',`r4') define(`v1',`r5') define(`v2',`r6')
define(`u0',`r3') define(`u1',`r14')
define(`w0',`r7') define(`w1',`r8') define(`w2',`r9')
define(`cy0',`r10') define(`cy1',`r11') define(`cy2',`r12')


ASM_START()
PROLOGUE(mpn_addmul_3)
C Save the registers used below as v, w and cy (r12 is not saved), plus r14
C which doubles as u1; the return address is popped straight into pc at exit.
	push	{ r4-r11, r14 }

	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
C Fetch all three multiplier limbs; r3 is reused as u0 from here on.
	ldm	vp, { v0,v1,v2 }
	mov	cy0, #0
	mov	cy1, #0
	mov	cy2, #0

C Tricky n mod 6
C Multiplying by the inverse of 3 leaves n mod 3 visible in bits 31-30 (for
C n well below 2^30) and a parity bit in bit 0.  The mask keeps just those
C three bits, and the rotate by 28 below turns them into a word offset:
C bits 31-30 land in bits 3-2 and bit 0 lands in bit 4.
	mul	w0, w0, n		C n * 3^{-1} mod 2^32
	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
C Bias the count by 3 so the loop test at lo2 can be a plain sign test.
	sub	n, n, #3
ifdef(`PIC',`
C Position independent variant: jump into a table of branch instructions.
C pc reads as this instruction plus 8, so offset 0 is the word after the nop.
	add	pc, pc, w0, ror $28
	nop
	b	L(b0)
	b	L(b2)
	b	L(b4)
	.word	0xe7f000f0	C udf
	b	L(b3)
	b	L(b5)
	b	L(b1)
',`
C Non-PIC variant: load the target address from a table of absolute
C addresses laid out in the same slot order as the branch table above.
	ldr	pc, [pc, w0, ror $28]
	nop
	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
')

C Entry points.  Each one biases rp and up so that the fixed offsets used at
C its loop entry address the first limbs, then preloads two rp words and the
C first up limb into the registers that loop stage expects.

C n mod 6 = 5: three loop stages before the first loop test.
L(b5):	add	up, up, #-8
	ldr	w1, [rp, #0]
	ldr	w2, [rp, #4]
	ldr	u1, [up, #8]
	b	L(lo5)

C n mod 6 = 4: two loop stages before the first loop test.
L(b4):	add	rp, rp, #-4
	add	up, up, #-12
	ldr	w2, [rp, #4]
	ldr	w0, [rp, #8]
	ldr	u0, [up, #12]
	b	L(lo4)

C n mod 6 = 3: one loop stage before the first loop test.
L(b3):	add	rp, rp, #-8
	add	up, up, #-16
	ldr	w0, [rp, #8]
	ldr	w1, [rp, #12]
	ldr	u1, [up, #16]
	b	L(lo3)

C n mod 6 = 1: five loop stages before the first loop test.
L(b1):	add	rp, rp, #8
	ldr	w2, [rp, #-8]
	ldr	w0, [rp, #-4]
	ldr	u1, [up, #0]
	b	L(lo1)

C n mod 6 = 0: four loop stages before the first loop test.
L(b0):	add	rp, rp, #4
	add	up, up, #-4
	ldr	w0, [rp, #-4]
	ldr	w1, [rp, #0]
	ldr	u0, [up, #4]
	b	L(lo0)

C n mod 6 = 2: no branch, execution falls through into a full pass of top.
L(b2):	add	rp, rp, #12
	add	up, up, #4
	ldr	w1, [rp, #-12]
	ldr	w2, [rp, #-8]
	ldr	u0, [up, #-4]

C Main loop, six limbs of up per pass.  Every stage has the same shape:
C load the next rp word into the free window register, multiply the current
C up limb by v0, v1 and v2 into the three window words, store the window
C word that is now complete, and load the up limb for the next stage.
C The window registers rotate w1, w2, w0 and the up limb alternates between
C u0 and u1.
	ALIGN(16)
L(top):	ldr	w0, [rp, #-4]
	umaal	w1, cy0, u0, v0
	ldr	u1, [up, #0]
	umaal	w2, cy1, u0, v1
	str	w1, [rp, #-12]
	umaal	w0, cy2, u0, v2
L(lo1):	ldr	w1, [rp, #0]
	umaal	w2, cy0, u1, v0
	ldr	u0, [up, #4]
	umaal	w0, cy1, u1, v1
	str	w2, [rp, #-8]
	umaal	w1, cy2, u1, v2
L(lo0):	ldr	w2, [rp, #4]
	umaal	w0, cy0, u0, v0
	ldr	u1, [up, #8]
	umaal	w1, cy1, u0, v1
	str	w0, [rp, #-4]
	umaal	w2, cy2, u0, v2
L(lo5):	ldr	w0, [rp, #8]
	umaal	w1, cy0, u1, v0
	ldr	u0, [up, #12]
	umaal	w2, cy1, u1, v1
	str	w1, [rp, #0]
	umaal	w0, cy2, u1, v2
L(lo4):	ldr	w1, [rp, #12]
	umaal	w2, cy0, u0, v0
	ldr	u1, [up, #16]
	umaal	w0, cy1, u0, v1
	str	w2, [rp, #4]
	umaal	w1, cy2, u0, v2
L(lo3):	ldr	w2, [rp, #16]
	umaal	w0, cy0, u1, v0
	ldr	u0, [up, #20]
	umaal	w1, cy1, u1, v1
	str	w0, [rp, #8]
	umaal	w2, cy2, u1, v2
C Loop control: n was biased by -3 on entry, so the loop repeats while the
C decremented count is still non-negative.  Both pointers advance six limbs.
L(lo2):	subs	n, n, #6
	add	up, up, #24
	add	rp, rp, #24
	bge	L(top)

C Wind-down: the last two up limbs.  u0 already holds the first of them and
C w1, w2 hold the last two rp words that were loaded.
L(end):	umaal	w1, cy0, u0, v0
	ldr	u1, [up, #0]		C final up limb
	umaal	w2, cy1, u0, v1
	str	w1, [rp, #-12]
C The next window word starts from zero instead of being loaded from rp.
	mov	w0, #0
	umaal	w0, cy2, u0, v2
	umaal	w2, cy0, u1, v0
	umaal	w0, cy1, u1, v1
	str	w2, [rp, #-8]
C cy2:cy1 = u1 * v2 + cy1 + cy2, the top partial product plus both carries.
	umaal	cy1, cy2, u1, v2
C Fold the remaining carry cy0 through the top words; the carry flag ripples
C from the adds through the adcs into the returned limb.
	adds	w0, w0, cy0
	str	w0, [rp, #-4]
	adcs	w1, cy1, #0
	str	w1, [rp, #0]
	adc	r0, cy2, #0		C return value: most significant limb

C Restore the saved registers and return by popping the saved r14 into pc.
	pop	{ r4-r11, pc }
EPILOGUE()