dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2011, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	43

C A quick adaptation of the 32-bit K7 code.


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rcx, %rax
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8

	bt	$0, R32(%rax)
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax

	xor	R32(%rbx), R32(%rbx)
	jmp	L(ent)

L(evn):	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	mov	(%rsi,%r8,8), %r9	C
	shrd	R8(%rcx), %r9, %rax	C
	nop				C
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(ent):	imul	%r10, %rax		C			6
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(top)

	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()
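
dnl  Editorial note, not part of the upstream file: the routine above is the
dnl  classical exact-division-by-a-limb scheme -- strip the factors of two
dnl  from d, compute inv = d^(-1) mod 2^64 by Newton iteration, then for each
dnl  limb multiply (u[i] - borrow) by inv and recover the next borrow from
dnl  the high half of q*d.  The C sketch below is a minimal restatement of
dnl  that scheme, assuming 64-bit limbs, n >= 1, an exactly divisible operand,
dnl  and GCC/Clang extensions (__builtin_ctzll, unsigned __int128).  The names
dnl  divexact_1_ref and binvert64 are illustrative only, and the inverse is
dnl  seeded arithmetically instead of from binvert_limb_table as the assembly
dnl  does; it is a sketch for reading the code, not a drop-in replacement.
dnl
dnl    #include <stdint.h>
dnl    #include <stddef.h>
dnl
dnl    /* Inverse of an odd d modulo 2^64, by the same Newton step the
dnl       assembly uses: inv = 2*inv - inv*inv*d doubles the valid bits.  */
dnl    static uint64_t binvert64(uint64_t d)
dnl    {
dnl        uint64_t inv = (d * 3) ^ 2;      /* correct to 5 bits */
dnl        inv = 2 * inv - inv * inv * d;   /* 10 bits */
dnl        inv = 2 * inv - inv * inv * d;   /* 20 bits */
dnl        inv = 2 * inv - inv * inv * d;   /* 40 bits */
dnl        inv = 2 * inv - inv * inv * d;   /* >= 64 bits */
dnl        return inv;
dnl    }
dnl
dnl    /* rp[0..n-1] = up[0..n-1] / d, assuming the division is exact. */
dnl    void divexact_1_ref(uint64_t *rp, const uint64_t *up, size_t n,
dnl                        uint64_t d)
dnl    {
dnl        unsigned c = __builtin_ctzll(d); /* shift count: trailing zeros of d */
dnl        uint64_t d1 = d >> c;            /* d without twos */
dnl        uint64_t inv = binvert64(d1);
dnl        uint64_t cy = 0;                 /* borrow limb */
dnl        for (size_t i = 0; i < n; i++) {
dnl            uint64_t u = up[i] >> c;     /* shift dividend down by c ... */
dnl            if (c != 0 && i + 1 < n)
dnl                u |= up[i + 1] << (64 - c);  /* ... as the shrd does */
dnl            uint64_t s = u - cy;
dnl            uint64_t b = cy > u;         /* carry bit: did we borrow?  */
dnl            uint64_t q = s * inv;        /* quotient limb, mod 2^64 */
dnl            rp[i] = q;
dnl            /* next borrow: high half of q*d1, plus the borrow bit */
dnl            cy = (uint64_t)(((unsigned __int128)q * d1) >> 64) + b;
dnl        }
dnl    }
dnl
dnl  Each Newton step doubles the number of correct low bits of the inverse,
dnl  which is why three 2*inv - inv*inv*d refinements after the 8-bit table
dnl  lookup suffice for a 64-bit limb in the assembly above.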