dnl  ARM v4 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C		cycles/limb
C		norm	unorm
C 1176		13	18
C Cortex-A5	 8	12
C Cortex-A7	10.5	18
C Cortex-A8	14	15
C Cortex-A9	10	12	not measured since latest edits
C Cortex-A15	 9	 9
C Cortex-A53	14	20

C Architecture requirements:
C v5	-
C v5t	-
C v5te	-
C v6	-
C v6t2	-
C v7a	-

C Overview
C
C mpn_bdiv_q_1 takes rp, up, n and d in r0-r3.  It strips the trailing zero
C bits from d (counting them in cnt), computes an inverse of the resulting
C odd d, and then branches into the body of mpn_pi1_bdiv_q_1.
C
C mpn_pi1_bdiv_q_1 takes the same four register arguments plus an inverse
C and a shift count read from the stack.  It writes n quotient limbs to rp.
C A zero shift count selects the plain loop, a nonzero one the loop that
C realigns the source limbs on the fly.
C
C Both loops decrement n before testing it, so n is assumed to be at least
C 1 on entry (presumably guaranteed by the callers -- TODO confirm).

C Incoming arguments.
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`d',  `r3')
define(`di_arg',  `sp[0]')		C	just mpn_pi1_bdiv_q_1
define(`cnt_arg', `sp[4]')		C	just mpn_pi1_bdiv_q_1

C Working registers.
define(`cy',  `r7')			C high limb of the previous d * q product
define(`cnt', `r6')			C right shift applied to the source limbs
define(`tnc', `r8')			C 32 - cnt
define(`qlimb', `r9')			C quotient limb about to be stored
define(`dinv',  `r10')			C inverse of d
define(`ulimb', `r11')			C current (possibly pre-shifted) source limb

ASM_START()
PROLOGUE(mpn_bdiv_q_1)
	push	{r6-r11}
	mov	cnt, #0			C stays 0 when d is already odd
	tst	d, #1
	bne	L(binv)

C Strip trailing zero bits from d in steps of 16, 8 and then a table lookup
C on the low byte, summing the amounts in cnt.
	movs	r10, d, lsl #16		C Z set when the low 16 bits are zero
	moveq	d, d, lsr #16
	moveq	cnt, #16
	tst	d, #0xff		C Z set when the low byte is zero
	moveq	d, d, lsr #8
	addeq	cnt, cnt, #8
	LEA(	r10, ctz_tab)
	and	r11, d, #0xff
	ldrb	r10, [r10, r11]		C zero bits remaining in the low byte
	mov	d, d, lsr r10
	add	cnt, cnt, r10

C Invert d.  The starting value comes from binvert_limb_table, which is
C defined elsewhere and indexed here by bits 1-7 of d.  It is then refined
C twice with x = 2*x - d*x*x.
L(binv):
	LEA(	r10, binvert_limb_table)
	and	r12, d, #254
	ldrb	r10, [r10, r12, lsr #1]
	mul	r12, r10, r10
	mul	r12, d, r12
	rsb	r12, r12, r10, lsl #1	C first refinement, result in r12
	mul	r10, r12, r12
	mul	r10, d, r10
	rsb	dinv, r10, r12, lsl #1	C second refinement, dinv = inverse
	b	L(entry)
EPILOGUE()

PROLOGUE(mpn_pi1_bdiv_q_1)
	push	{r6-r11}

C Six registers were just pushed, so the two stacked arguments are found
C 24 and 28 bytes above sp.
	ldr	dinv, [sp, #24]
	ldr	cnt, [sp, #28]

C Shared by both entry points.  Expects d, dinv and cnt to be set up.
L(entry):
	mov	cy, #0
	ldr	ulimb, [up], #4		C up[0]
	cmp	cnt, #0
	bne	L(shifted)

C Shift count is zero: source limbs are consumed unchanged.
	subs	n, n, #1		C leaves carry set, i.e. no borrow pending
	beq	L(nlast)

C The borrow travels between iterations in the carry flag.  Nothing between
C one sbcs and the next is written to alter that flag.
	ALIGN(16)
L(nloop):
	sbcs	cy, ulimb, cy
	ldr	ulimb, [up], #4
	sub	n, n, #1
	mul	qlimb, dinv, cy
	tst	n, n
	umull	r12, cy, d, qlimb	C only the high half, in cy, is used
	str	qlimb, [rp], #4
	bne	L(nloop)

C Final limb: no further product is needed.
L(nlast):
	sbc	cy, ulimb, cy
	mul	qlimb, dinv, cy
	str	qlimb, [rp]
	pop	{r6-r11}
	return	r14

C Shift count is nonzero: every source limb is rebuilt from the upper bits
C of the current limb and the lower bits of the next one.
L(shifted):
	rsb	tnc, cnt, #32
	mov	ulimb, ulimb, lsr cnt
	subs	n, n, #1		C leaves carry set, i.e. no borrow pending
	beq	L(ulast)

	ALIGN(16)
L(uloop):
	ldr	r12, [up], #4
	orr	r9, ulimb, r12, lsl tnc	C r9 = realigned source limb
	mov	ulimb, r12, lsr cnt
	sbcs	cy, r9, cy		C critical path ->cy->cy->
	sub	n, n, #1
	mul	qlimb, dinv, cy		C critical path ->cy->qlimb->
	tst	n, n
	umull	r12, cy, d, qlimb	C critical path ->qlimb->cy->
	str	qlimb, [rp], #4
	bne	L(uloop)

C Final limb: only the shifted remainder of the last source limb is left.
L(ulast):
	sbc	cy, ulimb, cy
	mul	qlimb, dinv, cy
	str	qlimb, [rp]
	pop	{r6-r11}
	return	r14
EPILOGUE()

C Number of trailing zero bits for each byte value 1-255.  Entry 0 holds 8.
	RODATA
ctz_tab:
	.byte	8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0