dnl  PowerPC-64 mpn_invert_limb -- Invert a normalized limb.

dnl  Copyright 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb (approximate)
C POWER3/PPC630         80
C POWER4/PPC970         86
C POWER5                86
C POWER6               170
C POWER7                66

C mpn_invert_limb
C
C In:       r3 = d, the limb to invert.  The table lookup below uses only
C           bits 55..62 of d, so bit 63 is presumably required to be set
C           (the "normalized" of the title) -- confirm against callers.
C Out:      r3 = v3 - (high64(v3 * d + d) + d), where v3 is the last of the
C           successively refined approximations v0, v1, v2, v3 built below.
C Scratch:  r0, r6, r7, r8, r9, r10, r11, r12 and the carry bit are
C           overwritten.  LEAL is a macro defined outside this file, so
C           anything else it may touch is not visible here.
C
C Straight-line code: no branches, no stack frame, and the only memory
C access is one halfword load from approx_tab.
C
C Derived inputs, as named in the comments below:
C   d40  = (d >> 24) + 1
C   d63  = ceil(d/2)
C   mask = -(d mod 2)	all ones when d is odd, zero when d is even

ASM_START()
PROLOGUE(mpn_invert_limb)
	LEAL(	r12, approx_tab)	C r12 = address of approx_tab
	srdi	r9, r3, 32		C high 32 bits of d
	rlwinm	r9, r9, 10, 23, 30	C ((d >> 55) & 0xff) << 1, byte offset into approx_tab
	srdi	r10, r3, 24		C d >> 24
	lis	r11, 0x1000		C 2^28
	rldicl	r8, r3, 0, 63		C d mod 2
	addi	r10, r10, 1		C d40
	sldi	r11, r11, 32		C 2^60
	srdi	r7, r3, 1		C d/2
	add	r7, r7, r8		C d63 = ceil(d/2)
	neg	r8, r8			C mask = -(d mod 2)
	lhzx	r0, r9, r12		C v0 = 16-bit table entry, zero extended
	mullw	r9, r0, r0		C v0*v0
	sldi	r6, r0, 11		C v0 << 11
	addi	r0, r6, -1		C (v0 << 11) - 1
	mulld	r9, r9, r10		C v0*v0*d40
	srdi	r9, r9, 40		C v0*v0*d40 >> 40
	subf	r9, r9, r0		C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
	mulld	r0, r9, r10		C v1*d40
	sldi	r6, r9, 13		C v1 << 13
	subf	r0, r0, r11		C 2^60 - v1*d40
	mulld	r0, r0, r9		C v1 * (2^60 - v1*d40)
	srdi	r0, r0, 47		C v1 * (2^60 - v1*d40) >> 47
	add	r0, r0, r6		C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
	mulld	r11, r0, r7		C v2 * d63
	srdi	r10, r0, 1		C v2 >> 1
	sldi	r9, r0, 31		C v2 << 31
	and	r8, r10, r8		C (v2 >> 1) & mask
	subf	r8, r11, r8		C ((v2 >> 1) & mask) - v2 * d63
	mulhdu	r0, r8, r0		C p1 = high 64 bits of v2 * (((v2 >> 1) & mask) - v2 * d63)
	srdi	r0, r0, 1		C p1 >> 1
	add	r0, r0, r9		C v3 = (v2 << 31) + (p1 >> 1)
	nop				C NOTE(review): purpose not stated; presumably scheduling or alignment padding
	mulhdu	r9, r0, r3		C high 64 bits of v3 * d
	mulld	r11, r0, r3		C low 64 bits of v3 * d
	addc	r10, r11, r3		C low + d; r10 itself is never read, only the carry out is used
	adde	r3, r9, r3		C high + d + carry
	subf	r3, r3, r0		C result = v3 - (high + d + carry)
	blr
EPILOGUE()

C approx_tab: 256 halfword (16-bit) entries, 32 rows of 8.
C
C mpn_invert_limb reads a single entry with lhzx at byte offset
C ((d >> 55) & 0xff) << 1, i.e. entry number (d >> 55) & 0xff, and uses it
C as the initial approximation v0.
C
C NOTE(review): spot checks of entries 0, 1, 6, 8, 128, 248 and 255 agree
C with floor(0x7fd00 / (256 + i)) for entry i; the remaining entries were
C not verified individually.
DEF_OBJECT(approx_tab)
	.short	0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
	.short	0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
	.short	0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
	.short	0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
	.short	0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
	.short	0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
	.short	0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
	.short	0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
	.short	0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
	.short	0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
	.short	0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
	.short	0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
	.short	0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
	.short	0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
	.short	0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
	.short	0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
	.short	0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
	.short	0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
	.short	0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
	.short	0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
	.short	0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
	.short	0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
	.short	0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
	.short	0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
	.short	0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
	.short	0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
	.short	0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
	.short	0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
	.short	0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
	.short	0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
	.short	0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
	.short	0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
END_OBJECT(approx_tab)
ASM_END()