dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.

dnl  Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                 cycles/limb (approx)    div
C AMD K8,K9        48                      71
C AMD K10          48                      77
C Intel P4        135                     161
C Intel core2      69                     116
C Intel corei      55                      89
C Intel atom      129                     191
C VIA nano         79                     157

C Register usage: rax rcx rdx rdi rsi r8

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

PROTECT(`mpn_invert_limb_table')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_invert_limb)               C                  Kn  C2  Ci
	FUNC_ENTRY(1)
	mov     %rdi, %rax              C                   0   0   0
	shr     $55, %rax               C                   1   1   1
ifdef(`PIC',`
ifdef(`DARWIN',`
	mov     mpn_invert_limb_table@GOTPCREL(%rip), %r8
	add     $-512, %r8
',`
	lea     -512+mpn_invert_limb_table(%rip), %r8
')',`
	movabs  $-512+mpn_invert_limb_table, %r8
')
	movzwl  (%r8,%rax,2), R32(%rcx) C %rcx = v0

	C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
	mov     %rdi, %rsi              C                   0   0   0
	mov     R32(%rcx), R32(%rax)    C                   4   5   5
	imul    R32(%rcx), R32(%rcx)    C                   4   5   5
	shr     $24, %rsi               C                   1   1   1
	inc     %rsi                    C %rsi = d40
	imul    %rsi, %rcx              C                   8  10   8
	shr     $40, %rcx               C                  12  15  11
	sal     $11, R32(%rax)          C                   5   6   6
	dec     R32(%rax)               C %rax = (v0 << 11) - 1
	sub     R32(%rcx), R32(%rax)    C %rax = v1

	C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
	mov     $0x1000000000000000, %rcx
	imul    %rax, %rsi              C                  14  17  13
	sub     %rsi, %rcx              C 2^60 - v1*d40
	imul    %rax, %rcx
	sal     $13, %rax
	shr     $47, %rcx
	add     %rax, %rcx              C %rcx = v2

	C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
	mov     %rdi, %rsi              C                   0   0   0
	shr     %rsi                    C d/2
	sbb     %rax, %rax              C -d0 = -(d mod 2)
	sub     %rax, %rsi              C d63 = ceil(d/2)
	imul    %rcx, %rsi              C v2 * d63
	and     %rcx, %rax              C v2 * d0
	shr     %rax                    C (v2>>1) * d0
	sub     %rsi, %rax              C (v2>>1) * d0 - v2 * d63
	mul     %rcx
	sal     $31, %rcx
	shr     %rdx
	add     %rdx, %rcx              C %rcx = v3

	C v4 = v3 - ((v3*d + d) >> 64) - d
	mov     %rdi, %rax
	mul     %rcx                    C rdx:rax = v3 * d
	add     %rdi, %rax              C low limb += d, carry out
	mov     %rcx, %rax              C %rax = v3
	adc     %rdi, %rdx              C rdx = hi(v3*d + d) + d
	sub     %rdx, %rax              C %rax = v4

	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()
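
dnl  Editor's note: the commented-out C model below is not part of GMP; it is
dnl  a hedged reference sketch of the Newton-iteration steps above, written so
dnl  the register-level code can be checked against plain arithmetic.  The
dnl  function name invert_limb_ref and the typedef u64 are hypothetical, the
dnl  constant 0x7fd00 = 2^19 - 3*2^8 is assumed to reproduce the entries of
dnl  mpn_invert_limb_table, and unsigned __int128 support is assumed.
dnl
dnl  typedef unsigned long long u64;
dnl
dnl  u64 invert_limb_ref (u64 d)    /* requires d normalized: top bit set */
dnl  {
dnl    u64 d9, d40, d63, v0, v1, v2, v3, e, h;
dnl
dnl    d9  = d >> 55;                         /* 9 high bits, in [256,511] */
dnl    v0  = 0x7fd00 / d9;                    /* the movzwl table lookup   */
dnl    d40 = (d >> 24) + 1;
dnl    v1  = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
dnl    v2  = (v1 << 13) + ((v1 * ((1ULL << 60) - v1 * d40)) >> 47);
dnl    d63 = (d >> 1) + (d & 1);              /* ceil(d/2) */
dnl    e   = ((v2 >> 1) & (0 - (d & 1))) - v2 * d63;     /* mod 2^64 */
dnl    h   = (u64) (((unsigned __int128) v2 * e) >> 64);
dnl    v3  = (v2 << 31) + (h >> 1);
dnl    h   = (u64) (((unsigned __int128) v3 * d + d) >> 64);
dnl    return v3 - (h + d);                   /* v4 */
dnl  }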
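dnl
dnl  The sketch can be tested against the defining property of this function
dnl  (with B = 2^64): the return value v satisfies v = floor((B^2 - 1)/d) - B,
dnl  the reciprocal consumed by udiv_qrnnd_preinv-style division.  A minimal
dnl  self-test, under the same assumptions as the sketch above:
dnl
dnl  int invert_limb_check (u64 d)
dnl  {
dnl    unsigned __int128 b2m1 = ~(unsigned __int128) 0;  /* B^2 - 1 */
dnl    u64 want = (u64) (b2m1 / d - ((unsigned __int128) 1 << 64));
dnl    return invert_limb_ref (d) == want;
dnl  }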