xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/mode1o.asm (revision d25ffa98a4bfca1fe272f3c182496ec9934faac7)
1dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P4: 19.0 cycles/limb
24
25
26C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
27C                               mp_limb_t divisor);
28C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29C                                mp_limb_t divisor, mp_limb_t carry);
30C
31
32defframe(PARAM_CARRY,  16)
33defframe(PARAM_DIVISOR,12)
34defframe(PARAM_SIZE,   8)
35defframe(PARAM_SRC,    4)
36
37	TEXT
38
39	ALIGN(16)
40PROLOGUE(mpn_modexact_1c_odd)
41deflit(`FRAME',0)
42
43	movd	PARAM_CARRY, %mm1
44	jmp	L(start_1c)
45
46EPILOGUE()
47
48
49	ALIGN(16)
50PROLOGUE(mpn_modexact_1_odd)
51deflit(`FRAME',0)
52
53	pxor	%mm1, %mm1		C carry limb
54L(start_1c):
55	movl	PARAM_DIVISOR, %eax
56
57	movd	PARAM_DIVISOR, %mm7
58
59	shrl	%eax
60
61	andl	$127, %eax		C d/2, 7 bits
62
63ifdef(`PIC',`
64	LEA(	binvert_limb_table, %edx)
65	movzbl	(%eax,%edx), %eax		C inv 8 bits
66',`
67	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
68')
69
70	C
71
72	movd	%eax, %mm6		C inv
73
74	movd	%eax, %mm0		C inv
75
76	pmuludq	%mm6, %mm6		C inv*inv
77
78	C
79
80	pmuludq	%mm7, %mm6		C inv*inv*d
81	paddd	%mm0, %mm0		C 2*inv
82
83	C
84
85	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
86	pxor	%mm6, %mm6
87
88	paddd	%mm0, %mm6
89	pmuludq	%mm0, %mm0		C inv*inv
90
91	C
92
93	pmuludq	%mm7, %mm0		C inv*inv*d
94	paddd	%mm6, %mm6		C 2*inv
95
96
97	movl	PARAM_SRC, %eax
98	movl	PARAM_SIZE, %ecx
99
100	C
101
102	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
103
104	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
105	pushl	%eax	FRAME_pushl()
106	movd	%mm6, %eax
107	imul	PARAM_DIVISOR, %eax
108	cmpl	$1, %eax
109	popl	%eax	FRAME_popl()')
110
111	pxor	%mm0, %mm0		C carry bit
112
113
114C The dependent chain here is as follows.
115C
116C				        latency
117C	psubq	 s = (src-cbit) - climb    2
118C	pmuludq	 q = s*inverse             8
119C	pmuludq	 prod = q*divisor          8
120C	psrlq	 climb = high(prod)        2
121C	                                  --
122C	                                  20
123C
124C Yet the loop measures 19.0 c/l, so obviously there's something gained
125C there over a straight reading of the chip documentation.
126
127L(top):
128	C eax	src, incrementing
129	C ebx
130	C ecx	counter, limbs
131	C edx
132	C
133	C mm0	carry bit
134	C mm1	carry limb
135	C mm6	inverse
136	C mm7	divisor
137
138	movd	(%eax), %mm2
139	addl	$4, %eax
140
141	psubq	%mm0, %mm2		C src - cbit
142
143	psubq	%mm1, %mm2		C src - cbit - climb
144	movq	%mm2, %mm0
145	psrlq	$63, %mm0		C new cbit
146
147	pmuludq	%mm6, %mm2		C s*inverse
148
149	movq	%mm7, %mm1
150	pmuludq	%mm2, %mm1		C q*divisor
151	psrlq	$32, %mm1		C new climb
152
153	subl	$1, %ecx
154	jnz	L(top)
155
156
157L(done):
158	paddq	%mm1, %mm0
159	movd	%mm0, %eax
160	emms
161	ret
162
163EPILOGUE()
164