dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.

dnl  Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                cycles/limb (approx)   div
C AMD K8,K9              48              71
C AMD K10                48              77
C Intel P4              135             161
C Intel core2            69             116
C Intel corei            55              89
C Intel atom            129             191
C VIA nano               79             157

C rax rcx rdx rdi rsi r8

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

PROTECT(`mpn_invert_limb_table')

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_invert_limb)               C               Kn  C2  Ci
        FUNC_ENTRY(1)
        mov     %rdi, %rax              C                0   0   0
        shr     $55, %rax               C                1   1   1
ifdef(`DARWIN',`
        lea     mpn_invert_limb_table(%rip), %r8
        add     $-512, %r8
',`
        lea     -512+mpn_invert_limb_table(%rip), %r8
')
        movzwl  (%r8,%rax,2), R32(%rcx) C       %rcx = v0

C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
        mov     %rdi, %rsi              C                0   0   0
        mov     R32(%rcx), R32(%rax)    C                4   5   5
        imul    R32(%rcx), R32(%rcx)    C                4   5   5
        shr     $24, %rsi               C                1   1   1
        inc     %rsi                    C       %rsi = d40
        imul    %rsi, %rcx              C                8  10   8
        shr     $40, %rcx               C               12  15  11
        sal     $11, R32(%rax)          C                5   6   6
        dec     R32(%rax)
        sub     R32(%rcx), R32(%rax)    C       %rax = v1

C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
        mov     $0x1000000000000000, %rcx
        imul    %rax, %rsi              C               14  17  13
        sub     %rsi, %rcx
        imul    %rax, %rcx
        sal     $13, %rax
        shr     $47, %rcx
        add     %rax, %rcx              C       %rcx = v2

C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
        mov     %rdi, %rsi              C                0   0   0
        shr     %rsi                    C d/2
        sbb     %rax, %rax              C -d0 = -(d mod 2)
        sub     %rax, %rsi              C d63 = ceil(d/2)
        imul    %rcx, %rsi              C v2 * d63
        and     %rcx, %rax              C v2 * d0
        shr     %rax                    C (v2>>1) * d0
        sub     %rsi, %rax              C (v2>>1) * d0 - v2 * d63
        mul     %rcx
        sal     $31, %rcx
        shr     %rdx
        add     %rdx, %rcx              C       %rcx = v3

        mov     %rdi, %rax
        mul     %rcx
        add     %rdi, %rax
        mov     %rcx, %rax
        adc     %rdi, %rdx
        sub     %rdx, %rax

        FUNC_EXIT()
        ret
EPILOGUE()
ASM_END()
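
dnl  Added reference sketch, not part of the original routine: a plain C
dnl  expression of the value the v0..v3 Newton steps above converge to,
dnl  under the assumption that for a normalized divisor d (bit 63 set) the
dnl  return value is floor((2^128 - 1)/d) - 2^64.  The name ref_invert_limb
dnl  is hypothetical, and the code relies on the GCC/Clang unsigned __int128
dnl  extension; it is only a checking aid, kept as a comment so this file
dnl  still assembles unchanged.
dnl
dnl      #include <stdint.h>
dnl
dnl      /* d must be normalized, i.e. have its most significant bit set */
dnl      uint64_t
dnl      ref_invert_limb (uint64_t d)
dnl      {
dnl        unsigned __int128 num = ~(unsigned __int128) 0;  /* 2^128 - 1 */
dnl        /* the quotient is 2^64 + v with v < 2^64, so its low limb is v */
dnl        return (uint64_t) (num / d);
dnl      }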