dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
dnl  1-limb divisor, returning quotient only.

dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009, 2011, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	 ?

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`d',	`%rcx')
define(`di',	`%r8')		C just mpn_pi1_bdiv_q_1
define(`ncnt',	`%r9')		C just mpn_pi1_bdiv_q_1

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)

C ---------------------------------------------------------------------------
C mpn_bdiv_q_1 -- arguments rp, up, n, d as named by the defines above.
C
C Removes the trailing zero bits of the divisor, remembering how many in rcx,
C derives a 64-bit inverse of the remaining odd value in r8, and then jumps
C into mpn_pi1_bdiv_q_1 at its common label with the same register layout
C that function establishes for itself.
C
C bsf leaves its destination undefined for a zero source, so a zero divisor
C is not handled here.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_bdiv_q_1)
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rdx, %r10		C limb count
	mov	%rcx, %rax		C divisor
	xor	R32(%rcx), R32(%rcx)	C shift count, zero for an odd divisor

	bt	$0, R32(%rax)
	jnc	L(even)			C only an even divisor needs the bsf

L(odd):
	mov	%rax, %rbx		C odd divisor, used by the Newton steps
	mov	%rax, %r11		C odd divisor, multiplier in the limb loop
	shr	R32(%rax)
	and	$127, R32(%rax)		C table index, 7 bits of divisor/2

	LEA(	binvert_limb_table, %rdx)
	movzbl	(%rdx,%rax), R32(%rax)	C inverse good to 8 bits

	C Each step below computes inv = 2*inv - inv*inv*d, which doubles
	C the number of valid low bits of the inverse.

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C now 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C now 32 bits

	lea	(%rax,%rax), %r8	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r8		C now 64 bits, left in r8

	jmp	L(common)

L(even):
	bsf	%rax, %rcx		C count of trailing zero bits
	shr	R8(%rcx), %rax		C divisor with those bits removed
	jmp	L(odd)
EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_pi1_bdiv_q_1 -- arguments rp, up, n, d, di, ncnt as named by the
C defines above.  The caller supplies the inverse di and the shift count
C ncnt.  Under DOS64 these two arrive on the stack and are loaded into r8
C and r9 first.
C
C Register use from the common label onwards:
C   r8   inverse
C   r11  divisor multiplied against each quotient limb
C   rcx  shift count applied to the source limbs
C   r10  limb counter
C   rbx  borrow carried between limbs, saved and restored around the body
C
C Only n equal to 1 gets a special path, so n equal to 0 is not handled.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_pi1_bdiv_q_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	push	%rbx

	mov	%rdx, %r10		C n
	mov	%rcx, %r11		C d
	mov	%r9, %rcx		C ncnt, moved to rcx so cl can drive shifts

L(common):
	mov	(up), %rax		C lowest source limb

	dec	%r10
	jz	L(single)		C exactly one limb

	mov	8(up), %rdx		C second source limb
	lea	(rp,%r10,8), rp		C point rp at its last limb
	lea	(up,%r10,8), up		C point up at its last limb
	neg	%r10			C negative index counting up to zero

	shrd	R8(%rcx), %rdx, %rax	C first limb of the shifted source

	xor	R32(%rbx), R32(%rbx)	C no borrow yet
	jmp	L(entry)

	ALIGN(8)
L(loop):
	C On arrival here:
	C   rax  quotient limb just stored
	C   rbx  borrow, 0 or 1
	C   rcx  shift count
	C   r10  negative limb index
	C rdx is overwritten by the mul.

	mul	%r11			C rdx = high limb of quotient limb times d
	mov	(up,%r10,8), %rax
	mov	8(up,%r10,8), %r9
	shrd	R8(%rcx), %r9, %rax	C next limb of the shifted source
	nop				C NOTE(review): presumably padding for scheduling or alignment
	sub	%rbx, %rax		C take off the pending borrow
	setc	R8(%rbx)		C borrow out of that subtraction
	sub	%rdx, %rax		C take off the carry limb
	adc	$0, %rbx		C plus the borrow out of this one
L(entry):
	imul	%r8, %rax		C quotient limb = value times inverse
	mov	%rax, (rp,%r10,8)
	inc	%r10
	jnz	L(loop)

	C Top limb: nothing above it to shift in, so a plain shr is used, and
	C no further borrow needs recording.
	mul	%r11			C rdx = high limb of quotient limb times d
	mov	(up), %rax		C highest source limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C take off the pending borrow
	sub	%rdx, %rax		C take off the carry limb
	imul	%r8, %rax
	mov	%rax, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret

L(single):
	shr	R8(%rcx), %rax		C shift the only limb
	imul	%r8, %rax		C times inverse gives the quotient limb
	mov	%rax, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()