xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/invert_limb.asm (revision 8450a7c42673d65e3b1f6560d3b6ecd317a6cbe8)
dnl  PowerPC-64 mpn_invert_limb -- Invert a normalized limb.

dnl  Copyright 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb (approximate)
C POWER3/PPC630         80
C POWER4/PPC970         86
C POWER5                86
C POWER6               170
C POWER7                66

ASM_START()

C mp_limb_t mpn_invert_limb (mp_limb_t d)
C In:   r3 = d, a normalized limb (most significant bit set)
C Out:  r3 = floor((B^2 - 1)/d) - B, with B = 2^64
C
C Method: an 8-bit table lookup gives an ~11-bit initial approximation v0,
C which three refinement steps widen to the full 64-bit reciprocal
C (v0 -> v1: ~21 bits, v1 -> v2: ~34 bits, v2 -> v3: 64 bits), followed
C by a final one-step adjustment.
PROLOGUE(mpn_invert_limb)
	LEAL(	r12, approx_tab)
	srdi	r9, r3, 32
	rlwinm	r9, r9, 10, 23, 30	C (d >> 55) & 0x1fe
	srdi	r10, r3, 24		C d >> 24
	lis	r11, 0x1000
	rldicl	r8, r3, 0, 63		C d mod 2
	addi	r10, r10, 1		C d40
	sldi	r11, r11, 32		C 2^60
	srdi	r7, r3, 1		C d/2
	add	r7, r7, r8		C d63 = ceil(d/2)
	neg	r8, r8			C mask = -(d mod 2)
	lhzx	r0, r9, r12		C v0 = approx_tab[(d >> 55) - 0x100]
	mullw	r9, r0, r0		C v0*v0
	sldi	r6, r0, 11		C v0 << 11
	addi	r0, r6, -1		C (v0 << 11) - 1
	mulld	r9, r9, r10		C v0*v0*d40
	srdi	r9, r9, 40		C v0*v0*d40 >> 40
	subf	r9, r9, r0		C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
	mulld	r0, r9, r10		C v1*d40
	sldi	r6, r9, 13		C v1 << 13
	subf	r0, r0, r11		C 2^60 - v1*d40
	mulld	r0, r0, r9		C v1 * (2^60 - v1*d40)
	srdi	r0, r0, 47		C v1 * (2^60 - v1*d40) >> 47
	add	r0, r0, r6		C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
	mulld	r11, r0, r7		C v2 * d63
	srdi	r10, r0, 1		C v2 >> 1
	sldi	r9, r0, 31		C v2 << 31
	and	r8, r10, r8		C (v2 >> 1) & mask
	subf	r8, r11, r8		C ((v2 >> 1) & mask) - v2 * d63
	mulhdu	r0, r8, r0		C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63)
	srdi	r0, r0, 1		C p1 >> 1
	add	r0, r0, r9		C v3 = (v2 << 31) + (p1 >> 1)
	nop				C filler, presumably for issue-group alignment
	mulhdu	r9, r0, r3		C hi64(v3 * d)
	mulld	r11, r0, r3		C lo64(v3 * d)
	addc	r10, r11, r3		C lo64(v3*d) + d, setting carry
	adde	r3, r9, r3		C hi64(v3*d) + d + carry = hi64(d*(v3 + B + 1))
	subf	r3, r3, r0		C result = v3 - hi64(d*(v3 + B + 1))
	blr
EPILOGUE()

C 256 halfword entries, indexed by the top 8 bits of d below the (always-set)
C leading bit: entry i approximates 2^19/(0x100 + i), descending from 0x7fd
C down to 0x400.  Read via lhzx above with byte offset 2*((d >> 55) - 0x100).
DEF_OBJECT(approx_tab)
        .short  0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
        .short  0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
        .short  0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
        .short  0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
        .short  0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
        .short  0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
        .short  0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
        .short  0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
        .short  0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
        .short  0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
        .short  0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
        .short  0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
        .short  0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
        .short  0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
        .short  0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
        .short  0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
        .short  0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
        .short  0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
        .short  0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
        .short  0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
        .short  0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
        .short  0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
        .short  0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
        .short  0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
        .short  0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
        .short  0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
        .short  0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
        .short  0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
        .short  0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
        .short  0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
        .short  0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
        .short  0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
END_OBJECT(approx_tab)
ASM_END()