dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.

dnl  Copyright 2001, 2002, 2004-2006, 2010-2012, 2017 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library. If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	    cycles/limb    cycles/limb
C	       norm	       unorm
C AMD K8,K9	11		11
C AMD K10	11		11
C AMD bull	13.5		14
C AMD pile	14		15
C AMD steam
C AMD excavator
C AMD bobcat	14		14
C AMD jaguar	14.5		15
C Intel P4	33		33
C Intel core2	13.5		13.25
C Intel NHM	14		14
C Intel SBR	 8		 8.25
C Intel IBR	 7.75		 7.85
C Intel HWL	 8		 8
C Intel BWL	 8		 8
C Intel SKL	 8		 8
C Intel atom	34		36
C Intel SLM	13.7		13.5
C VIA nano	19.25		19.25	needs re-measuring

C INPUT PARAMETERS
define(`rp',   `%rdi')
define(`up',   `%rsi')
define(`n',    `%rdx')
define(`d',    `%rcx')
define(`di',   `%r8')		C	just mpn_pi1_bdiv_q_1
define(`ncnt', `%r9')		C	just mpn_pi1_bdiv_q_1

C OVERVIEW
C
C Both entry points write n quotient limbs to rp.  Each quotient limb is
C formed as (dividend limb - carry) * inverse, and the carry for the next
C limb is the high half of quotient limb * divisor plus a borrow bit.
C
C mpn_bdiv_q_1 (rp, up, n, d)
C	Strips the trailing zero bits from d, computes the inverse of the
C	remaining odd value modulo 2^64, then continues in the shared body
C	at the pi1 label.
C
C mpn_pi1_bdiv_q_1 (rp, up, n, d, di, ncnt)
C	Same body, but the caller supplies the inverse di and the shift
C	count ncnt.  NOTE(review): presumably d is expected to be the odd
C	part of the divisor with di its inverse -- confirm against callers.
C
C Requirements visible in the code:
C	n >= 1	the limb count is decremented and tested for zero before any
C		bounds are formed, so n = 0 is not handled.
C	d != 0	bsf on a zero divisor gives no usable shift count.
C
C REGISTER USE IN THE SHARED BODY (from the pi1 label onwards)
C	rax	current dividend limb, then the quotient limb
C	rbx	borrow carried between limbs (callee-saved, pushed on entry)
C	rcx	shift count; cl is negated back and forth in the unorm loop
C	rdx	high half of quotient limb * divisor (the carry limb)
C	r8	inverse of the divisor
C	r9	partially shifted dividend limb, or the high limb at the end
C	r10	limb counter, negative and counting up towards zero
C	r11	divisor

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
	FUNC_ENTRY(4)
	push	%rbx			C matched by the pop in the shared body

	mov	%rcx, %rax		C rax = d
	xor	R32(%rcx), R32(%rcx)	C ncnt count
	mov	%rdx, %r10		C r10 = n, frees rdx for the table base

	bt	$0, R32(%rax)
	jnc	L(evn)			C skip bsf unless divisor is even

C Here rax is odd.  Look up an 8-bit inverse and refine it three times
C with inv = 2*inv - inv*inv*d; each step doubles the number of valid
C low bits.
L(odd):	mov	%rax, %rbx		C rbx = odd divisor, kept for the refinement
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r8	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits

	jmp	L(pi1)			C r8, r10, r11, rcx are set up as pi1 expects

C Even divisor: count the trailing zero bits into rcx and shift them out,
C then rejoin the odd path with the shift count preserved in rcx.
L(evn):	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
	jmp	L(odd)
EPILOGUE()

PROLOGUE(mpn_pi1_bdiv_q_1)
	FUNC_ENTRY(4)
C Under DOS64 the fifth and sixth arguments are fetched from the stack.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	push	%rbx

	mov	%rcx, %r11		C d
	mov	%rdx, %r10		C n
	mov	%r9, %rcx		C ncnt

C Shared body.  mpn_bdiv_q_1 jumps in here with the same register layout
C and with its own rbx already pushed.
L(pi1):	mov	(up), %rax		C up[0]

	dec	%r10
	jz	L(one)			C single limb, no loop needed

	lea	8(up,%r10,8), up	C up end
	lea	(rp,%r10,8), rp		C rp end
	neg	%r10			C -n

	test	R32(%rcx), R32(%rcx)
	jnz	L(unorm)		C branch if count != 0
	xor	R32(%rbx), R32(%rbx)	C no borrow before the first limb
	jmp	L(nent)

C Loop for shift count zero: dividend limbs are used as they are.
C The trailing numbers on some lines are the original timing annotations.
	ALIGN(8)
L(ntop):mul	%r11			C carry limb in rdx	0 10
	mov	-8(up,%r10,8), %rax	C next dividend limb
	sub	%rbx, %rax		C apply carry bit
	setc	R8(%rbx)		C borrow from the carry bit
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, R32(%rbx)		C add borrow from limb	6
L(nent):imul	%r8, %rax		C quotient limb		6
	mov	%rax, (rp,%r10,8)	C store quotient limb
	inc	%r10			C
	jnz	L(ntop)

	mov	-8(up), %r9		C up high limb
	jmp	L(com)

C Loop for nonzero shift count.  Each working limb is assembled as
C (up[i] >> cnt) | (up[i+1] << (64-cnt)).  Negating rcx switches cl
C between cnt and -cnt; a 64-bit shift uses only the low six bits of cl,
C so -cnt acts as 64-cnt.
L(unorm):
	mov	(up,%r10,8), %r9	C up[1]
	shr	R8(%rcx), %rax		C low part from up[0]
	neg	R32(%rcx)
	shl	R8(%rcx), %r9		C high part from up[1]
	neg	R32(%rcx)
	or	%r9, %rax
	xor	R32(%rbx), R32(%rbx)	C no borrow before the first limb
	jmp	L(uent)

	ALIGN(8)
L(utop):mul	%r11			C carry limb in rdx	0 10
	mov	(up,%r10,8), %rax	C next dividend limb
	shl	R8(%rcx), %rax		C its low bits become the high part
	neg	R32(%rcx)
	or	%r9, %rax		C merge with saved low part
	sub	%rbx, %rax		C apply carry bit
	setc	R8(%rbx)		C borrow from the carry bit
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, R32(%rbx)		C add borrow from limb	6
L(uent):imul	%r8, %rax		C quotient limb		6
	mov	(up,%r10,8), %r9	C reload limb for the next low part
	shr	R8(%rcx), %r9		C low part for the next iteration
	neg	R32(%rcx)
	mov	%rax, (rp,%r10,8)	C store quotient limb
	inc	%r10			C
	jnz	L(utop)

C Last limb, common to both loops: r9 holds the final dividend limb
C (already shifted in the unorm case) and rax the previous quotient limb.
C No further borrow is needed after this limb.
L(com):	mul	%r11			C carry limb in rdx
	sub	%rbx, %r9		C apply carry bit
	sub	%rdx, %r9		C apply carry limb
	imul	%r8, %r9
	mov	%r9, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret

C n = 1: rp[0] = (up[0] >> cnt) * inverse.  rp has not been advanced here.
L(one):	shr	R8(%rcx), %rax
	imul	%r8, %rax
	mov	%rax, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()