dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	10
C K10:		10
C P4:		33
C P6 core2:	13.25
C P6 corei7:	14
C P6 atom:	42

C A quick adoption of the 32-bit K7 code.
C mpn_divexact_1(rp, up, n, divisor)
C
C Divide the n-limb number {up,n} exactly by `divisor', storing the n-limb
C quotient at {rp,n}.  The caller guarantees the division is exact (no
C remainder); the result is undefined otherwise.
C
C Method: exact division by multiplication with the inverse of the divisor
C modulo 2^64.  Any factor of 2 in the divisor is first stripped (shift
C count in rcx, applied to the dividend limbs with shrd), leaving an odd
C divisor d.  An 8-bit inverse of d is looked up in binvert_limb_table and
C refined to 16, 32 and then 64 bits by Newton iterations of the form
C inv = 2*inv - inv*inv*d, each of which doubles the number of correct
C low bits.  The main loop then computes each quotient limb as
C q = (limb - borrow) * inv mod 2^64, propagating a borrow bit (rbx) and
C a borrow limb (rdx, the high half of q*d) to the next iteration.

C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	push	%rbx			C callee-saved, used for d and the carry bit

	mov	%rcx, %rax		C rax = divisor
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8		C r8 = n (rdx is needed by mul)

	bt	$0, R32(%rax)		C test divisor bit 0
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = odd divisor d
	shr	R32(%rax)		C discard bit 0 (known to be 1)
	and	$127, R32(%rax)		C d/2, 7 bits: index into inverse table

ifdef(`PIC',`
	mov	binvert_limb_table@GOTPCREL(%rip), %rdx
',`
	movabs	$binvert_limb_table, %rdx
')

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	C Newton iteration: 8-bit inverse -> 16 bits
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	C Newton iteration: 16-bit inverse -> 32 bits
	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	C Newton iteration: 32-bit inverse -> full 64 bits, kept in r10
	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	C Point rsi/rdi past the operands and count r8 up from -n to 0,
	C so the loop needs only one induction variable.
	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)			C n = 1 is a single multiply, no loop

	mov	(%rsi,%r8,8), %rdx	C up[1]

	shrd	R8(%rcx), %rdx, %rax	C low limb shifted down, bits from up[1] shifted in

	xor	R32(%rbx), R32(%rbx)	C clear carry bit
	jmp	L(ent)			C enter loop past the mul/sub carry step

L(evn):	bsf	%rax, %rcx		C rcx = count of trailing zeros in divisor
	shr	R8(%rcx), %rax		C strip factors of 2, making divisor odd
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	C r10	d^(-1) mod 2^64
	C r11	d, shifted down

	C Trailing comment numbers are the intended issue cycles of the
	C recurrence (NOTE(review): per the original scheduling annotations).
	mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C next dividend limb
	mov	(%rsi,%r8,8), %r9	C limb above it, source of shifted-in bits
	shrd	R8(%rcx), %r9, %rax	C align limb for the stripped twos
	nop				C scheduling filler
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C record borrow from carry bit
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C accumulate borrow from carry limb	6
L(ent):	imul	%r10, %rax		C q = limb * d^(-1) mod 2^64	6
	mov	%rax, (%rdi,%r8,8)	C store quotient limb
	inc	%r8			C
	jnz	L(top)

	C Final limb: same as the loop body but no limb above to shift in.
	mul	%r11			C carry limb in rdx
	mov	-8(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax		C plain shift, no shrd needed at the top
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	ret

L(one):	shr	R8(%rcx), %rax		C n = 1: single limb, no carries
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	ret

EPILOGUE()