dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
dnl  1-limb divisor, returning quotient only.

dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	10
C K10:		10
C P4:		33
C P6 core2:	13.25
C P6 corei7:	14
C P6 atom:	42


C INPUT PARAMETERS
C rp		rdi
C up		rsi
C n		rdx
C d		rcx
C di		r8	just mpn_pi1_bdiv_q_1
C shift		r9	just mpn_pi1_bdiv_q_1

C Syntax is AT&T (source, destination) run through m4.  Arguments arrive in
C the System V AMD64 integer argument registers listed above.  rbx is the
C only callee-saved register used; it is pushed once on entry to either
C routine and popped on both exit paths of the shared tail.
C
C up[0] is loaded unconditionally at L(com), so n must be at least 1.
C NOTE(review): d is presumably required to be nonzero -- bsf leaves its
C destination undefined for a zero source; confirm against the callers.


ASM_START()
	TEXT
	ALIGN(16)

C mpn_bdiv_q_1 -- entry taking rp, up, n, d.
C
C Derives the two extra values mpn_pi1_bdiv_q_1 takes as arguments, then
C jumps into the shared code at L(com):
C   rcx  shift = number of low zero bits removed from d (0 if d is odd)
C   r11  d with those low zero bits shifted out
C   r8   inverse of r11, built up to 64 bits from an 8-bit table entry
C   r10  n
PROLOGUE(mpn_bdiv_q_1)
	push	%rbx

	mov	%rcx, %rax
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r10

	bt	$0, R32(%rax)
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

ifdef(`PIC',`
	mov	binvert_limb_table@GOTPCREL(%rip), %rdx
',`
	movabs	$binvert_limb_table, %rdx
')

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	C Three refinement steps, each doubling the number of valid low bits
	C of the inverse.  The first two only need 32-bit arithmetic.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r8	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits

	jmp	L(com)

	C Even divisor: record the count of low zero bits in rcx, shift them
	C out of d, and rejoin the odd path.
L(evn):	bsf	%rax, %rcx
	shr	R8(%rcx), %rax
	jmp	L(odd)
EPILOGUE()

C mpn_pi1_bdiv_q_1 -- entry taking rp, up, n, d, di, shift.
C
C Same as above, except that the inverse (di, in r8) and the shift count
C (in r9) are supplied as arguments instead of being computed here.
C NOTE(review): d is used directly as the multiplier in r11, so it is
C presumably expected to be already shifted to match di; confirm against
C the callers.
C
C Each quotient limb is
C   q = (source limb after right shift - carry bit - carry limb) * inverse
C where the carry limb is the high half of the previous q times d and the
C carry bit collects the borrows of the two subtractions.
PROLOGUE(mpn_pi1_bdiv_q_1)
	push	%rbx

	mov	%rcx, %r11		C d
	mov	%rdx, %r10		C n
	mov	%r9, %rcx		C shift
L(com):
	mov	(%rsi), %rax		C up[0]

	dec	%r10
	jz	L(one)

	mov	8(%rsi), %rdx		C up[1]
	lea	(%rsi,%r10,8), %rsi	C up end
	lea	(%rdi,%r10,8), %rdi	C rp end
	neg	%r10			C -n

	C First limb: low bits come from up[0], high bits from up[1].
	shrd	R8(%rcx), %rdx, %rax

	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
	jmp	L(ent)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r10	counter, limbs, negative

	mul	%r11			C carry limb in rdx
	mov	(%rsi,%r10,8), %rax
	mov	8(%rsi,%r10,8), %r9
	shrd	R8(%rcx), %r9, %rax
	C NOTE(review): the nop has no architectural effect; presumably
	C scheduling or alignment padding, so it is kept in place.
	nop
	sub	%rbx, %rax		C apply carry bit
	setc	R8(%rbx)		C borrow of that subtraction
	sub	%rdx, %rax		C apply carry limb
	adc	$0, %rbx		C plus borrow of this one
L(ent):	imul	%r8, %rax
	mov	%rax, (%rdi,%r10,8)
	inc	%r10
	jnz	L(top)

	C Last limb: there is no higher source limb to shift in, so a plain
	C shr is used, and the borrow out of it is not recorded.
	mul	%r11			C carry limb in rdx
	mov	(%rsi), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r8, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	ret

	C n == 1: a single limb, no carries involved.
L(one):	shr	R8(%rcx), %rax
	imul	%r8, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	ret
EPILOGUE()