dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C	     norm	unorm
C AMD K8,K9	11	11
C AMD K10	11	11
C Intel P4	 ?
C Intel core2	13.5	13.25
C Intel corei	14.25
C Intel atom	34	36
C VIA nano	19.25	19.25


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C divisor	rcx

C OVERVIEW
C
C Writes n quotient limbs to rp, least significant limb first, from the n
C source limbs at up and the single-limb divisor.  Limbs are 8 bytes.
C
C   1. The divisor is split as d * 2^cnt with d odd.  cnt is found with bsf
C      and is left at zero when the divisor is already odd.
C   2. inv = d^-1 mod 2^64 is built from an 8-bit table entry, indexed by
C      bits 1..7 of d, followed by three Newton steps.
C   3. Each quotient limb is q = inv * (u - borrow - carry_limb) mod 2^64,
C      where u is the next source limb, taken from the source shifted right
C      by cnt bits, and carry_limb is the high half of the previous q * d.
C
C Two loops are provided: L(ntop) for cnt == 0, labelled norm in the table
C above, and L(utop) for cnt != 0, labelled unorm, which also performs the
C cnt-bit right shift of the source on the fly.
C
C Assumptions that the code does not check:
C   - n >= 1.  up[0] is loaded before n is examined and there is no n == 0
C     exit.
C   - divisor != 0.  NOTE(review): bsf has no defined result for a zero
C     source operand - confirm that callers guarantee a nonzero divisor.
C   - The division is exact, as the file title says.  No remainder is
C     computed or returned.
C
C REGISTER USE after setup
C rax	current source limb, then quotient limb q
C rbx	saved on entry and restored on exit; holds d during setup, then the
C	borrow carried from one limb to the next
C rcx	shift count cnt; the unorm loop flips it between cnt and -cnt
C rdx	table base during setup, then the high half of q * d
C rsi	up + 8*n, one past the last source limb
C rdi	rp + 8*n - 8, the address of the last quotient limb
C r8	negative limb counter, running up towards zero
C r9	pending shifted source bits in the unorm loop, then the high limb
C r10	inv, the inverse of d modulo 2^64
C r11	d, the divisor with its low zero bits removed

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
C FUNC_ENTRY and FUNC_EXIT are defined outside this file.
C NOTE(review): presumably they adapt the DOS64 argument registers to the
C STD64 layout listed under INPUT PARAMETERS - confirm in the x86_64 m4
C definitions.
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rcx, %rax
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8

C CF receives bit 0 of the divisor.  For an odd divisor the count stays
C zero and no shift is needed.
	bt	$0, R32(%rax)
	jc	L(odd)			C skip bsfq unless divisor is even
	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

C binvert_limb_table is defined outside this file.  One byte is read from
C it at index rax and used as the 8-bit starting inverse.
	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

C Three Newton steps follow.  The first two run in 32-bit registers because
C only the low 16 and then 32 bits are needed; the last runs in 64 bits and
C leaves the full inverse in r10.  rax and rdx alternate as the destination.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

C Bias both pointers to the end of their operands so that the single
C negative counter r8 indexes them and the loops terminate on r8 == 0.
	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)			C n == 1, single limb

	test	R32(%rcx), R32(%rcx)
	jnz	L(unorm)		C branch if count != 0
	xor	R32(%rbx), R32(%rbx)	C no borrow into the first limb
	jmp	L(nent)

C Normalised loop, cnt == 0.  On entry at L(nent) rax holds the next
C adjusted source limb and rdi + 8*r8 addresses the quotient limb to write.
C The numbers in the right-hand comment column are kept from the original
C source.  NOTE(review): they look like scheduling cycle annotations.
	ALIGN(8)
L(ntop):mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(nent):imul	%r10, %rax		C			6
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(ntop)

C The last limb is finished in the shared tail, which expects the high
C source limb in r9 and the previous quotient limb still in rax.
	mov	-8(%rsi), %r9		C up high limb
	jmp	L(com)

C Unnormalised start, cnt != 0.  Forms the first shifted source limb
C   rax = (up[0] >> cnt) | (up[1] << (64-cnt))
C The left shift uses cl = -cnt.  A 64-bit shift uses only the low 6 bits
C of cl, so this shifts by 64-cnt.  The second neg restores cnt.
L(unorm):
	mov	(%rsi,%r8,8), %r9	C up[1]
	shr	R8(%rcx), %rax		C
	neg	R32(%rcx)
	shl	R8(%rcx), %r9		C
	neg	R32(%rcx)
	or	%r9, %rax
	xor	R32(%rbx), R32(%rbx)	C no borrow into the first limb
	jmp	L(uent)

C Unnormalised loop.  At L(utop) rcx holds -cnt and r9 holds the previous
C source limb shifted right by cnt.  The newly loaded limb is shifted left
C by 64-cnt and merged with r9.  rcx is negated once in each half of the
C loop, so it reads cnt at L(uent) and -cnt again at L(utop).
	ALIGN(8)
L(utop):mul	%r11			C carry limb in rdx	0 10
	mov	(%rsi,%r8,8), %rax	C
	shl	R8(%rcx), %rax		C
	neg	R32(%rcx)
	or	%r9, %rax
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(uent):imul	%r10, %rax		C			6
	mov	(%rsi,%r8,8), %r9	C
	shr	R8(%rcx), %r9		C
	neg	R32(%rcx)
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(utop)

C Shared tail for both loops.  r9 is the high source limb, already shifted
C right by cnt on the unorm path, and rax is the previous quotient limb.
C rcx is not used from here on.  No borrow is recorded after this limb.
L(com):	mul	%r11			C carry limb in rdx
	sub	%rbx, %r9		C apply carry bit
	sub	%rdx, %r9		C apply carry limb
	imul	%r10, %r9
	mov	%r9, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

C n == 1.  The only quotient limb is (up[0] >> cnt) * inv.  With cnt == 0
C the shift leaves rax unchanged.
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()