dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13.25
C Intel corei	14
C Intel atom	42
C VIA nano	43

C A quick adoption of the 32-bit K7 code.
C INPUT PARAMETERS
C rp		rdi	C result (quotient) limb pointer
C up		rsi	C source limb pointer
C n		rdx	C limb count; code assumes n >= 1
C divisor	rcx	C divisor d; division must be exact for a valid result

C Method: exact division by multiplication with the inverse of d modulo
C 2^64.  An even d is first reduced: bsf counts its trailing zero bits,
C d is shifted down to its odd part, and each source limb is shifted
C right by that count (shrd pulls in bits from the next-higher limb).
C The 64-bit inverse of the odd part is built from an 8-bit table
C lookup (binvert_limb_table) followed by three Newton steps
C (inv = 2*inv - inv*inv*d), each doubling the valid bit count:
C 8 -> 16 -> 32 -> 64.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx			C callee-saved; holds d, later the carry bit

	mov	%rcx, %rax		C rax = d
	xor	R32(%rcx), R32(%rcx)	C shift count (stays 0 for odd d)
	mov	%rdx, %r8		C r8 = n

	bt	$0, R32(%rax)		C test the low bit of d
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = odd d
	shr	R32(%rax)		C d/2
	and	$127, R32(%rax)		C d/2, 7 bits (table index)

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	C Newton step: 8 -> 16 valid inverse bits
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	C Newton step: 16 -> 32 valid inverse bits
	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	C Newton step: 32 -> 64 valid inverse bits
	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	C Point rsi/rdi past the operands and count r8 up from -n to 0.
	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)			C single-limb operand

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax	C shift up[0], pulling bits from up[1]
					C (no-op shift when d was odd)

	xor	R32(%rbx), R32(%rbx)	C clear carry bit
	jmp	L(ent)

C Even divisor: count twos, strip them from d, rejoin the odd-d path.
L(evn):	bsf	%rax, %rcx		C rcx = number of trailing zeros of d
	shr	R8(%rcx), %rax		C odd part of d
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	mov	(%rsi,%r8,8), %r9	C
	shrd	R8(%rcx), %r9, %rax	C shifted next source limb
	nop				C (scheduling filler)
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C record borrow as next carry bit
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(ent):	imul	%r10, %rax		C q = limb * inv	6
	mov	%rax, (%rdi,%r8,8)	C store quotient limb
	inc	%r8			C
	jnz	L(top)

	C Wind down: the highest limb uses a plain shr, since there is no
	C higher limb to shift bits in from.
	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

C n = 1: quotient is simply (up[0] >> shift) * inverse.
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()