xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/dive_1.asm (revision 2718af68c3efc72c9769069b5c7f9ed36f6b9def)
1dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P4: 19.0 cycles/limb
35
36
37C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
38C                      mp_limb_t divisor);
39C
40C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
41C being on the dependent chain and there being plenty of cycles available,
42C using an unaligned movq on every second iteration measured about 23 c/l.
43C
44C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
45C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
46C that might be hidden by out-of-order execution, whereas divl is around 60.
47C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
48C faster.
49
50defframe(PARAM_DIVISOR,16)
51defframe(PARAM_SIZE,   12)
52defframe(PARAM_SRC,    8)
53defframe(PARAM_DST,    4)
54
55	TEXT
56
57	ALIGN(16)
58PROLOGUE(mpn_divexact_1)
59deflit(`FRAME',0)
60
61	movl	PARAM_SIZE, %edx
62
63	movl	PARAM_SRC, %eax
64
65	movl	PARAM_DIVISOR, %ecx
66	subl	$1, %edx
67	jnz	L(two_or_more)
68
69	movl	(%eax), %eax
70	xorl	%edx, %edx
71
72	divl	%ecx
73	movl	PARAM_DST, %ecx
74
75	movl	%eax, (%ecx)
76	ret
77
78
79L(two_or_more):
80	C eax	src
81	C ebx
82	C ecx	divisor
83	C edx	size-1
84
85	movl	%ecx, %eax
86	bsfl	%ecx, %ecx		C trailing twos
87
88	shrl	%cl, %eax		C d = divisor without twos
89	movd	%eax, %mm6
90	movd	%ecx, %mm7		C shift
91
92	shrl	%eax			C d/2
93
94	andl	$127, %eax		C d/2, 7 bits
95
96ifdef(`PIC',`
97	LEA(	binvert_limb_table, %ecx)
98	movzbl	(%eax,%ecx), %eax		C inv 8 bits
99',`
100	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
101')
102
103	C
104
105	movd	%eax, %mm5		C inv
106
107	movd	%eax, %mm0		C inv
108
109	pmuludq	%mm5, %mm5		C inv*inv
110
111	C
112
113	pmuludq	%mm6, %mm5		C inv*inv*d
114	paddd	%mm0, %mm0		C 2*inv
115
116	C
117
118	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
119	pxor	%mm5, %mm5
120
121	paddd	%mm0, %mm5
122	pmuludq	%mm0, %mm0		C inv*inv
123
124	pcmpeqd	%mm4, %mm4
125	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
126
127	C
128
129	pmuludq	%mm6, %mm0		C inv*inv*d
130	paddd	%mm5, %mm5		C 2*inv
131
132	movl	PARAM_SRC, %eax
133	movl	PARAM_DST, %ecx
134	pxor	%mm1, %mm1		C initial carry limb
135
136	C
137
138	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
139
140	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
141	pushl	%eax	FRAME_pushl()
142	movq	%mm6, %mm0
143	pmuludq	%mm5, %mm0
144	movd	%mm0, %eax
145	cmpl	$1, %eax
146	popl	%eax	FRAME_popl()')
147
148	pxor	%mm0, %mm0		C initial carry bit
149
150
151C The dependent chain here is as follows.
152C
153C					latency
154C	psubq	 s = (src-cbit) - climb	   2
155C	pmuludq	 q = s*inverse		   8
156C	pmuludq	 prod = q*divisor	   8
157C	psrlq	 climb = high(prod)	   2
158C					  --
159C					  20
160C
161C Yet the loop measures 19.0 c/l, so obviously there's something gained
162C there over a straight reading of the chip documentation.
163
164L(top):
165	C eax	src, incrementing
166	C ebx
167	C ecx	dst, incrementing
168	C edx	counter, size-1 iterations
169	C
170	C mm0	carry bit
171	C mm1	carry limb
172	C mm4	0x00000000FFFFFFFF
173	C mm5	inverse
174	C mm6	divisor
175	C mm7	shift
176
177	movd	(%eax), %mm2
178	movd	4(%eax), %mm3
179	addl	$4, %eax
180	punpckldq %mm3, %mm2
181
182	psrlq	%mm7, %mm2
183	pand	%mm4, %mm2		C src
184	psubq	%mm0, %mm2		C src - cbit
185
186	psubq	%mm1, %mm2		C src - cbit - climb
187	movq	%mm2, %mm0
188	psrlq	$63, %mm0		C new cbit
189
190	pmuludq	%mm5, %mm2		C s*inverse
191	movd	%mm2, (%ecx)		C q
192	addl	$4, %ecx
193
194	movq	%mm6, %mm1
195	pmuludq	%mm2, %mm1		C q*divisor
196	psrlq	$32, %mm1		C new climb
197
198	subl	$1, %edx
199	jnz	L(top)
200
201
202L(done):
203	movd	(%eax), %mm2
204	psrlq	%mm7, %mm2		C src
205	psubq	%mm0, %mm2		C src - cbit
206
207	psubq	%mm1, %mm2		C src - cbit - climb
208
209	pmuludq	%mm5, %mm2		C s*inverse
210	movd	%mm2, (%ecx)		C q
211
212	emms
213	ret
214
215EPILOGUE()
216ASM_END()
217