dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code.  "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
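C To verify the rearrangement, collect the d terms:
C -2^N*d + (2^N-1-q1)*d = (-1-q1)*d = -d - q1*d, so both forms compute
C the same dr.
C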
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
C since detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped, or
C if it's not skipped then a copy of the src high limb is stored.  The
C latter is in case src==dst.
C
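C As a sketch in C (illustrative names, not from the code):
C
C	high = src[size-1];
C	skip = (high < divisor);               /* the cmpl/cmov pair */
C	dst[size+xsize-1] = skip ? 0 : high;   /* copy high in case src==dst */
C	carry = skip ? high : 0;
C	size -= skip;                          /* the sbbl $0 */
C
C Both arms are computed and selected with cmov, so no branch is needed.
C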
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.
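dnl
dnl  As a rough cross-check of that threshold: ~85 cycles of setup divided
dnl  by the ~26 c/l saved (58-32) puts break-even near 3.3 limbs; the
dnl  measured crossover is at 5, presumably due to the other overheads of
dnl  the inverse path.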

deflit(MUL_THRESHOLD, 5)


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')
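dnl  (safe because each of these parameters is fetched before the register
dnl  save that overwrites its slot)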

	TEXT

	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movd	PARAM_PREINV_INVERSE, %mm4

	movd	PARAM_PREINV_SHIFT, %mm7  C l
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry

	movd	%edx, %mm1		C carry
	movl	$0, %edx

	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax

	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]

	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C high cmp divisor

	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb

	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor

	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]

	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)


	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax
	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)
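
C In C terms the two divl loops amount to roughly the following sketch,
C using udiv_qrnnd from longlong.h, with r the running remainder
C (initially the carry):
C
C	for (i = size-1; i >= 0; i--)     /* L(divide_integer) */
C	  udiv_qrnnd (dst[xsize+i], r, r, src[i], divisor);
C	for (i = xsize-1; i >= 0; i--)    /* L(divide_fraction) */
C	  udiv_qrnnd (dst[i], r, r, 0, divisor);
C	return r;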



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax

	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d

	C

	movd	%eax, %mm4		C m
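
C The divl above computes m = floor((b*(b-d)-1)/d) directly: edx=(b-1)-d
C and eax=b-1 make edx:eax = ((b-d)-1)*b + (b-1) = b*(b-d)-1.  As a C
C sketch (d already normalized):
C
C	uint32_t m = (uint32_t) ((((uint64_t) ~d << 32) | 0xFFFFFFFF) / d);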


L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	psllq	%mm7, %mm0		C n2 = carry << l, for size==0

	subl	$1, %ecx
	jb	L(integer_none)

	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)


C The dependent chain here consists of
C
C	2   paddd    n1+n2
C	8   pmuludq  m*(n1+n2)
C	2   paddq    n2:nadj + m*(n1+n2)
C	2   psrlq    q1
C	8   pmuludq  d*q1
C	2   psubq    (n-d)-q1*d
C	2   psrlq    high n-(q1+1)*d mask
C	2   pand     d masked
C	2   paddd    n2+d addback
C	--
C	30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.
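C
C For reference, one iteration in C terms (a sketch; the limb fetching
C and shifting is omitted and the variable names are illustrative):
C
C	n1   = n10 >> 31;                        /* high bit, 0 or 1 */
C	nadj = n10 + (-n1 & d);                  /* mod 2^32, overflow ignored */
C	q1   = (uint32_t) ((((uint64_t) n2 << 32 | nadj)
C	                    + (uint64_t) m * (n2 + n1)) >> 32);
C	dr   = ((uint64_t) n2 << 32 | n10) - d - (uint64_t) q1 * d;
C	mask = (uint32_t) (dr >> 32);            /* 0 or 0xFFFFFFFF */
C	q    = q1 + 1 + mask;                    /* back to q1 on overshoot */
C	n2   = (uint32_t) dr + (mask & d);       /* remainder is the next n2 */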

	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d

	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0

	psubq	%mm5, %mm3		C n - d

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ecx
	ja	L(integer_top)


L(integer_last):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1+1

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi


L(integer_none):
	C eax
	C ebx	xsize

	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0


L(fraction_done):
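C The loop worked with d normalized (shifted up by l), so the remainder
C in mm0 comes out shifted too; the psrld brings it back down.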
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder

	movl	SAVE_EDI, %edi
	movd	%mm0, %eax

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret



C -----------------------------------------------------------------------------
C

L(fraction_some):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp

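C With the src limbs exhausted, n10 is 0 throughout this loop, so n1 and
C nadj drop out and the step reduces to (sketch, names as above):
C
C	q1 = n2 + (uint32_t) (((uint64_t) m * n2) >> 32);
C	dr = ((uint64_t) n2 << 32) - d - (uint64_t) q1 * d;
C
C followed by the same q1+1/mask/addback finish as the integer loop.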

L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2

	pcmpeqd	%mm2, %mm2
	psrlq	$63, %mm2

	C

	psrlq	$32, %mm0		C high(m*n2)

	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)

	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d

	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d

	C

	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0

	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm1, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d

	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ebx
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()
