dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.

dnl  Copyright 2004, 2007, 2008, 2009, 2011, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb (approx)	div
C AMD K8,K9	 48			 71
C AMD K10	 48			 77
C Intel P4	135			161
C Intel core2	 69			116
C Intel corei	 55			 89
C Intel atom	129			191
C VIA nano	 79			157

C rax rcx rdx rdi rsi r8

C Interface, as far as this file shows it:
C   In:    %rdi = d, the limb to invert.  The title says d is normalized;
C          the table lookup below relies on bit 63 of d being set.
C   Out:   %rax = the computed inverse.
C          NOTE(review): presumably floor((2^128 - 1) / d) - 2^64 as in the
C          Moller-Granlund reciprocal algorithm; confirm against the C header.
C   Uses:  rax rcx rdx rsi r8 and the flags.  No stack is used.
C   Data:  mpn_invert_limb_table, defined outside this file, read as an
C          array of 16-bit entries.
C
C FUNC_ENTRY, FUNC_EXIT, R32, PROLOGUE and friends come from config.m4 and
C are not visible here.  NOTE(review): FUNC_ENTRY(1) presumably moves the
C single DOS64 argument into %rdi; verify against the macro definition.
C
C The instruction order is hand scheduled.  Do not reorder without measuring,
C and keep the add/mov/adc sequence at the end intact, since adc consumes
C the carry produced by add.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

PROTECT(`mpn_invert_limb_table')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_invert_limb)		C			Kn	C2	Ci
	FUNC_ENTRY(1)
	mov	%rdi, %rax		C			 0	 0	 0
	shr	$55, %rax		C			 1	 1	 1

	C %rax = d >> 55 is the table index.  With bit 63 of d set it lies in
	C 256..511, so the base is biased by -512 bytes (256 entries of 2 bytes)
	C to make index 256 address the first table entry.
ifdef(`PIC',`
ifdef(`DARWIN',`
	mov	mpn_invert_limb_table@GOTPCREL(%rip), %r8
	add	$-512, %r8
',`
	lea	-512+mpn_invert_limb_table(%rip), %r8
')',`
	movabs	$-512+mpn_invert_limb_table, %r8
')
	movzwl	(%r8,%rax,2), R32(%rcx)	C	%rcx = v0

	C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
	mov	%rdi, %rsi		C			 0	 0	 0
	mov	R32(%rcx), R32(%rax)	C			 4	 5	 5
	imul	R32(%rcx), R32(%rcx)	C			 4	 5	 5
	shr	$24, %rsi		C			 1	 1	 1
	inc	%rsi			C	%rsi = d40
	imul	%rsi, %rcx		C			 8	10	 8
	shr	$40, %rcx		C			12	15	11
	sal	$11, R32(%rax)		C			 5	 6	 6
	dec	R32(%rax)
	sub	R32(%rcx), R32(%rax)	C	%rax = v1

	C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
	mov	$0x1000000000000000, %rcx	C 2^60
	imul	%rax, %rsi		C			14	17	13
	sub	%rsi, %rcx		C 2^60 - v1*d40
	imul	%rax, %rcx		C v1 * (2^60 - v1*d40)
	sal	$13, %rax
	shr	$47, %rcx
	add	%rax, %rcx		C	%rcx = v2

	C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
	C where mask is all ones when d is odd and zero when d is even.
	mov	%rdi, %rsi		C			 0	 0	 0
	shr	%rsi			C d/2
	sbb	%rax, %rax		C -d0 = -(d mod 2)
	sub	%rax, %rsi		C d63 = ceil(d/2)
	imul	%rcx, %rsi		C v2 * d63
	and	%rcx, %rax		C v2 * d0
	shr	%rax			C (v2>>1) * d0
	sub	%rsi, %rax		C (v2>>1) * d0 - v2 * d63
	mul	%rcx			C %rdx:%rax = v2 * the value above
	sal	$31, %rcx
	shr	%rdx			C high limb >> 1, i.e. the product >> 65
	add	%rdx, %rcx		C	%rcx = v3

	C Final adjustment: %rax = v3 - (hi(v3*d + d) + d), taken mod 2^64.
	mov	%rdi, %rax
	mul	%rcx			C %rdx:%rax = d * v3
	add	%rdi, %rax		C only the carry out of lo + d is needed
	mov	%rcx, %rax		C %rax = v3 (mov leaves CF intact)
	adc	%rdi, %rdx		C %rdx = hi + d + carry
	sub	%rdx, %rax		C result

	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()