xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/dive_1.asm (revision ca453df649ce9db45b64d73678ba06cbccf9aa11)
1dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P4: 19.0 cycles/limb
24
25
26C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
27C                      mp_limb_t divisor);
28C
29C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
30C being on the dependent chain and there being plenty of cycles available,
31C using an unaligned movq on every second iteration measured about 23 c/l.
32C
33C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
34C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
35C that might be hidden by out-of-order execution, whereas divl is around 60.
36C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
37C faster.
38
39defframe(PARAM_DIVISOR,16)
40defframe(PARAM_SIZE,   12)
41defframe(PARAM_SRC,    8)
42defframe(PARAM_DST,    4)
43
44	TEXT
45
46	ALIGN(16)
47PROLOGUE(mpn_divexact_1)
48deflit(`FRAME',0)
49
50	movl	PARAM_SIZE, %edx
51
52	movl	PARAM_SRC, %eax
53
54	movl	PARAM_DIVISOR, %ecx
55	subl	$1, %edx
56	jnz	L(two_or_more)
57
58	movl	(%eax), %eax
59	xorl	%edx, %edx
60
61	divl	%ecx
62	movl	PARAM_DST, %ecx
63
64	movl	%eax, (%ecx)
65	ret
66
67
68L(two_or_more):
69	C eax	src
70	C ebx
71	C ecx	divisor
72	C edx	size-1
73
74	movl	%ecx, %eax
75	bsfl	%ecx, %ecx		C trailing twos
76
77	shrl	%cl, %eax		C d = divisor without twos
78	movd	%eax, %mm6
79	movd	%ecx, %mm7		C shift
80
81	shrl	%eax			C d/2
82
83	andl	$127, %eax		C d/2, 7 bits
84
85ifdef(`PIC',`
86	LEA(	binvert_limb_table, %ecx)
87	movzbl	(%eax,%ecx), %eax		C inv 8 bits
88',`
89	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
90')
91
92	C
93
94	movd	%eax, %mm5		C inv
95
96	movd	%eax, %mm0		C inv
97
98	pmuludq	%mm5, %mm5		C inv*inv
99
100	C
101
102	pmuludq	%mm6, %mm5		C inv*inv*d
103	paddd	%mm0, %mm0		C 2*inv
104
105	C
106
107	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
108	pxor	%mm5, %mm5
109
110	paddd	%mm0, %mm5
111	pmuludq	%mm0, %mm0		C inv*inv
112
113	pcmpeqd	%mm4, %mm4
114	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
115
116	C
117
118	pmuludq	%mm6, %mm0		C inv*inv*d
119	paddd	%mm5, %mm5		C 2*inv
120
121	movl	PARAM_SRC, %eax
122	movl	PARAM_DST, %ecx
123	pxor	%mm1, %mm1		C initial carry limb
124
125	C
126
127	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
128
129	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
130	pushl	%eax	FRAME_pushl()
131	movq	%mm6, %mm0
132	pmuludq	%mm5, %mm0
133	movd	%mm0, %eax
134	cmpl	$1, %eax
135	popl	%eax	FRAME_popl()')
136
137	pxor	%mm0, %mm0		C initial carry bit
138
139
140C The dependent chain here is as follows.
141C
142C				        latency
143C	psubq	 s = (src-cbit) - climb    2
144C	pmuludq	 q = s*inverse             8
145C	pmuludq	 prod = q*divisor          8
146C	psrlq	 climb = high(prod)        2
147C	                                  --
148C	                                  20
149C
150C Yet the loop measures 19.0 c/l, so obviously there's something gained
151C there over a straight reading of the chip documentation.
152
153L(top):
154	C eax	src, incrementing
155	C ebx
156	C ecx	dst, incrementing
157	C edx	counter, size-1 iterations
158	C
159	C mm0	carry bit
160	C mm1	carry limb
161	C mm4	0x00000000FFFFFFFF
162	C mm5	inverse
163	C mm6	divisor
164	C mm7	shift
165
166	movd	(%eax), %mm2
167	movd	4(%eax), %mm3
168	addl	$4, %eax
169	punpckldq %mm3, %mm2
170
171	psrlq	%mm7, %mm2
172	pand	%mm4, %mm2		C src
173	psubq	%mm0, %mm2		C src - cbit
174
175	psubq	%mm1, %mm2		C src - cbit - climb
176	movq	%mm2, %mm0
177	psrlq	$63, %mm0		C new cbit
178
179	pmuludq	%mm5, %mm2		C s*inverse
180	movd	%mm2, (%ecx)		C q
181	addl	$4, %ecx
182
183	movq	%mm6, %mm1
184	pmuludq	%mm2, %mm1		C q*divisor
185	psrlq	$32, %mm1		C new climb
186
187	subl	$1, %edx
188	jnz	L(top)
189
190
191L(done):
192	movd	(%eax), %mm2
193	psrlq	%mm7, %mm2		C src
194	psubq	%mm0, %mm2		C src - cbit
195
196	psubq	%mm1, %mm2		C src - cbit - climb
197
198	pmuludq	%mm5, %mm2		C s*inverse
199	movd	%mm2, (%ecx)		C q
200
201	emms
202	ret
203
204EPILOGUE()
205