dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb (approx)	div
C K8,K9:	 48			 71
C K10:		 48			 77
C P4:	        135			161
C P6 core2:	 69			116
C P6 corei7:	 55			 89
C P6 atom:	129			191

C rax rcx rdx rdi rsi r8

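dnl  For orientation, a rough C equivalent of the computation below
dnl  (an illustrative sketch only, not part of the build; the function name
dnl  and the 128-bit integer extension are assumptions).  All arithmetic
dnl  wraps modulo 2^64, matching the 64-bit registers used in the code:
dnl
dnl	#include <stdint.h>
dnl	typedef unsigned __int128 u128;
dnl	extern const uint16_t approx_tab[256];	/* the table defined below */
dnl
dnl	uint64_t
dnl	invert_limb_ref (uint64_t d)	/* d normalized, i.e. bit 63 set */
dnl	{
dnl	  uint64_t v0 = approx_tab[(d >> 55) - 256];
dnl	  uint64_t d40 = (d >> 24) + 1;
dnl	  uint64_t v1 = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
dnl	  uint64_t v2 = (v1 << 13)
dnl	    + ((v1 * ((UINT64_C(1) << 60) - v1 * d40)) >> 47);
dnl	  uint64_t d0 = d & 1;
dnl	  uint64_t d63 = (d >> 1) + d0;	/* ceil(d/2) */
dnl	  uint64_t e = ((v2 >> 1) & (-d0)) - v2 * d63;
dnl	  uint64_t v3 = (v2 << 31) + (uint64_t) (((u128) v2 * e) >> 65);
dnl	  uint64_t hi = (uint64_t) (((u128) d * v3 + d) >> 64);
dnl	  return v3 - hi - d;	/* floor((2^128 - 1) / d) - 2^64 */
dnl	}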

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_invert_limb)		C			Kn	C2	Ci
	mov	%rdi, %rax		C			 0	 0	 0
	shr	$55, %rax		C			 1	 1	 1
ifdef(`PIC',`
ifdef(`DARWIN',`
	mov	approx_tab@GOTPCREL(%rip), %r8
	add	$-512, %r8
',`
	lea	-512+approx_tab(%rip), %r8
')',`
	movabs	$-512+approx_tab, %r8
')
	movzwl	(%r8,%rax,2), R32(%rcx)	C	%rcx = v0
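	C approx_tab has 256 16-bit entries, indexed by the 9 most significant
	C bits of d (a value in 256..511); the -512 byte bias above maps that
	C range onto entries 0..255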

	C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
	mov	%rdi, %rsi		C			 0	 0	 0
	mov	R32(%rcx), R32(%rax)	C			 4	 5	 5
	imul	R32(%rcx), R32(%rcx)	C			 4	 5	 5
	shr	$24, %rsi		C			 1	 1	 1
	inc	%rsi			C	%rsi = d40
	imul	%rsi, %rcx		C			 8	10	 8
	shr	$40, %rcx		C			12	15	11
	sal	$11, R32(%rax)		C			 5	 6	 6
	dec	R32(%rax)
	sub	R32(%rcx), R32(%rax)	C	%rax = v1

	C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
	mov	$0x1000000000000000, %rcx
	imul	%rax, %rsi		C			14	17	13
	sub	%rsi, %rcx
	imul	%rax, %rcx
	sal	$13, %rax
	shr	$47, %rcx
	add	%rax, %rcx		C	%rcx = v2

	C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + (v2>>1) & mask) >> 65)
	mov	%rdi, %rsi		C			 0	 0	 0
	shr	$1, %rsi		C d/2
	sbb	%rax, %rax		C -d0 = -(d mod 2)
	sub	%rax, %rsi		C d63 = ceil(d/2)
	imul	%rcx, %rsi		C v2 * d63
	and	%rcx, %rax		C v2 * d0
	shr	$1, %rax		C (v2>>1) * d0
	sub	%rsi, %rax		C (v2>>1) * d0 - v2 * d63
	mul	%rcx
	sal	$31, %rcx
	shr	$1, %rdx
	add	%rdx, %rcx		C	%rcx = v3

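	C Final adjustment: v = v3 - floor((2^64 + v3 + 1) * d / 2^64) (mod 2^64),
	C computed as v3 minus d minus the high limb of d*v3 + d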
	mov	%rdi, %rax
	mul	%rcx
	add	%rdi, %rax
	mov	%rcx, %rax
	adc	%rdi, %rdx
	sub	%rdx, %rax

	ret
EPILOGUE()

	RODATA
	ALIGN(2)
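C  Initial 11-bit approximations: entry i is floor((2^19 - 3*2^8) / (256 + i)),
C  for i = 0..255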
approx_tab:
	.value	0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
	.value	0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
	.value	0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
	.value	0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
	.value	0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
	.value	0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
	.value	0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
	.value	0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
	.value	0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
	.value	0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
	.value	0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
	.value	0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
	.value	0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
	.value	0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
	.value	0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
	.value	0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
	.value	0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
	.value	0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
	.value	0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
	.value	0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
	.value	0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
	.value	0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
	.value	0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
	.value	0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
	.value	0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
	.value	0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
	.value	0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
	.value	0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
	.value	0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
	.value	0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
	.value	0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
	.value	0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
ASM_END()
