dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.

dnl  Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/limb (approx)    div
C K8,K9:             48                  71
C K10:               48                  77
C P4:               135                 161
C P6 core2:          69                 116
C P6 corei7:         55                  89
C P6 atom:          129                 191

C rax rcx rdx rdi rsi r8


ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_invert_limb)               C                       Kn   C2   Ci
        mov     %rdi, %rax              C                        0    0    0
        shr     $55, %rax               C                        1    1    1
ifdef(`PIC',`
ifdef(`DARWIN',`
        mov     approx_tab@GOTPCREL(%rip), %r8
        add     $-512, %r8
',`
        lea     -512+approx_tab(%rip), %r8
')',`
        movabs  $-512+approx_tab, %r8
')
        movzwl  (%r8,%rax,2), R32(%rcx) C       %rcx = v0

        C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
        mov     %rdi, %rsi              C                        0    0    0
        mov     R32(%rcx), R32(%rax)    C                        4    5    5
        imul    R32(%rcx), R32(%rcx)    C                        4    5    5
        shr     $24, %rsi               C                        1    1    1
        inc     %rsi                    C       %rsi = d40
        imul    %rsi, %rcx              C                        8   10    8
        shr     $40, %rcx               C                       12   15   11
        sal     $11, R32(%rax)          C                        5    6    6
        dec     R32(%rax)
        sub     R32(%rcx), R32(%rax)    C       %rax = v1

        C v2 = (v1 << 13) + ((v1 * (2^60 - v1*d40)) >> 47)
        mov     $0x1000000000000000, %rcx
        imul    %rax, %rsi              C                       14   17   13
        sub     %rsi, %rcx
        imul    %rax, %rcx
        sal     $13, %rax
        shr     $47, %rcx
        add     %rax, %rcx              C       %rcx = v2

        C v3 = (v2 << 31) + ((v2 * (2^96 - v2*d63 + ((v2>>1) & mask))) >> 65)
        mov     %rdi, %rsi              C                        0    0    0
        shr     $1, %rsi                C       d/2
        sbb     %rax, %rax              C       -d0 = -(d mod 2)
        sub     %rax, %rsi              C       d63 = ceil(d/2)
        imul    %rcx, %rsi              C       v2 * d63
        and     %rcx, %rax              C       v2 * d0
        shr     $1, %rax                C       (v2>>1) * d0
        sub     %rsi, %rax              C       (v2>>1) * d0 - v2 * d63
        mul     %rcx
        sal     $31, %rcx
        shr     $1, %rdx
        add     %rdx, %rcx              C       %rcx = v3

        C Adjust: result = v3 - floor((2^64 + v3 + 1) * d / 2^64)
        mov     %rdi, %rax
        mul     %rcx                    C       rdx:rax = v3 * d
        add     %rdi, %rax              C       carry from low(v3*d) + d
        mov     %rcx, %rax
        adc     %rdi, %rdx              C       rdx = high(v3*d) + d + carry
        sub     %rdx, %rax

        ret
EPILOGUE()

        RODATA
        ALIGN(2)
approx_tab:
        .value  0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
        .value  0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
        .value  0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
        .value  0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
        .value  0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
        .value  0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
        .value  0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
        .value  0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
        .value  0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
        .value  0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
        .value  0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
        .value  0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
        .value  0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
        .value  0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
        .value  0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
        .value  0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
        .value  0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
        .value  0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
        .value  0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
        .value  0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
        .value  0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
        .value  0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
        .value  0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
        .value  0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
        .value  0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
        .value  0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
        .value  0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
        .value  0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
        .value  0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
        .value  0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
        .value  0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
        .value  0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
ASM_END()
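
dnl  For reference, the table lookup, the three refinement steps and the final
dnl  adjustment above correspond roughly to the C sketch below.  It is an
dnl  illustration only, not part of the build: the function name is made up,
dnl  unsigned __int128 (a GCC/Clang extension) stands in for the widening mul,
dnl  the division by d9 reproduces the approx_tab entries (approx_tab[i] =
dnl  floor((2^19 - 3*2^8) / (2^8 + i))), and the uint64_t arithmetic wraps
dnl  mod 2^64 just like the imul/sal instructions it mirrors.
dnl
dnl    #include <stdint.h>
dnl
dnl    uint64_t
dnl    invert_limb_sketch (uint64_t d)   /* d normalized: bit 63 set */
dnl    {
dnl      uint64_t d0, d9, d40, d63, v0, v1, v2, e, v3, h;
dnl
dnl      d9  = d >> 55;                  /* top 9 bits, in [256,511] */
dnl      d40 = (d >> 24) + 1;
dnl      d0  = d & 1;
dnl      d63 = (d >> 1) + d0;            /* ceil(d/2) */
dnl
dnl      v0 = 0x7fd00 / d9;              /* = approx_tab[d9 - 256] */
dnl      v1 = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
dnl      v2 = (v1 << 13) + ((v1 * ((UINT64_C(1) << 60) - v1 * d40)) >> 47);
dnl      e  = ((v2 >> 1) & (0 - d0)) - v2 * d63;          /* mod 2^64 */
dnl      v3 = (v2 << 31) + (uint64_t) (((unsigned __int128) v2 * e) >> 65);
dnl
dnl      /* subtract the high limb of (2^64 + v3 + 1) * d */
dnl      h = (uint64_t) (((unsigned __int128) v3 * d + d) >> 64);
dnl      return v3 - h - d;              /* floor((2^128 - 1)/d) - 2^64 */
dnl    }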