xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/divrem_1.asm (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
2dnl  division.
3
4dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
36
37
38C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
39C                         mp_srcptr src, mp_size_t size,
40C                         mp_limb_t divisor);
41C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
42C                          mp_srcptr src, mp_size_t size,
43C                          mp_limb_t divisor, mp_limb_t carry);
44C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
45C                                mp_srcptr src, mp_size_t size,
46C                                mp_limb_t divisor, mp_limb_t inverse,
47C                                unsigned shift);
48C
49C Algorithm:
50C
51C The method and nomenclature follow part 8 of "Division by Invariant
52C Integers using Multiplication" by Granlund and Montgomery, reference in
53C gmp.texi.
54C
55C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
56C for m', and "d" for d_norm, which won't cause any confusion since it's
57C only the normalized divisor that's of any use in the code.  "b" is written
58C for 2^N, the size of a limb, N being 32 here.
59C
60C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
61C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer.  If
62C q1==0xFFFFFFFF, then q1+1 would overflow.  We branch to a special case
63C "q1_ff" if this occurs.  Since the true quotient is either q1 or q1+1 then
64C if q1==0xFFFFFFFF that must be the right value.
65C
66C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
67C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1.  This
68C then goes through as normal, and finding no addback required.  sbbl costs
69C an extra cycle over what the main loop code does, but it keeps code size
70C and complexity down.
71C
72C Notes:
73C
74C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
75C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
76C carry, since in normal circumstances that will be a very rare event.
77C
78C The test for skipping a division is branch free (once size>=1 is tested).
79C The store to the destination high limb is 0 when a divide is skipped, or
80C if it's not skipped then a copy of the src high limb is used.  The latter
81C is in case src==dst.
82C
83C There's a small bias towards expecting xsize==0, by having code for
84C xsize==0 in a straight line and xsize!=0 under forward jumps.
85C
86C Alternatives:
87C
88C If the divisor is normalized (high bit set) then a division step can
89C always be skipped, since the high destination limb is always 0 or 1 in
90C that case.  It doesn't seem worth checking for this though, since it
91C probably occurs infrequently, in particular note that big_base for a
92C decimal mpn_get_str is not normalized in a 32-bit limb.
93
94
95dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
96dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
97dnl
98dnl  The inverse takes about 50 cycles to calculate, but after that the
99dnl  multiply is 17 c/l versus division at 42 c/l.
100dnl
101dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
102dnl  even more so on the fractional part.
103
104deflit(MUL_THRESHOLD, 3)
105
106
107defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
108defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
109defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
110defframe(PARAM_DIVISOR,20)
111defframe(PARAM_SIZE,   16)
112defframe(PARAM_SRC,    12)
113defframe(PARAM_XSIZE,  8)
114defframe(PARAM_DST,    4)
115
116defframe(SAVE_EBX,    -4)
117defframe(SAVE_ESI,    -8)
118defframe(SAVE_EDI,    -12)
119defframe(SAVE_EBP,    -16)
120
121defframe(VAR_NORM,    -20)
122defframe(VAR_INVERSE, -24)
123defframe(VAR_SRC,     -28)
124defframe(VAR_DST,     -32)
125defframe(VAR_DST_STOP,-36)
126
127deflit(STACK_SPACE, 36)
128
129	TEXT
130	ALIGN(32)
131
132PROLOGUE(mpn_preinv_divrem_1)
133deflit(`FRAME',0)
134	movl	PARAM_XSIZE, %ecx
135	movl	PARAM_DST, %edx
136	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
137
138	movl	%esi, SAVE_ESI
139	movl	PARAM_SRC, %esi
140
141	movl	%ebx, SAVE_EBX
142	movl	PARAM_SIZE, %ebx
143
144	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
145	movl	%ebp, SAVE_EBP
146	movl	PARAM_DIVISOR, %ebp
147
148	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
149	movl	%edi, SAVE_EDI
150	xorl	%edi, %edi		C carry
151
152	movl	-4(%esi,%ebx,4), %eax	C src high limb
153	xor	%ecx, %ecx
154
155	C
156
157	C
158
159	cmpl	%ebp, %eax		C high cmp divisor
160
161	cmovc(	%eax, %edi)		C high is carry if high<divisor
162	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
163					C (the latter in case src==dst)
164
165	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
166	sbbl	$0, %ebx		C skip one division if high<divisor
167	movl	PARAM_PREINV_SHIFT, %ecx
168
169	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
170	movl	$32, %eax
171
172	movl	%edx, VAR_DST		C &dst[xsize+size]
173
174	shll	%cl, %ebp		C d normalized
175	subl	%ecx, %eax
176	movl	%ecx, VAR_NORM
177
178	movd	%eax, %mm7		C rshift
179	movl	PARAM_PREINV_INVERSE, %eax
180	jmp	L(start_preinv)
181
182EPILOGUE()
183
184
185	ALIGN(16)
186
187PROLOGUE(mpn_divrem_1c)
188deflit(`FRAME',0)
189	movl	PARAM_CARRY, %edx
190	movl	PARAM_SIZE, %ecx
191	subl	$STACK_SPACE, %esp
192deflit(`FRAME',STACK_SPACE)
193
194	movl	%ebx, SAVE_EBX
195	movl	PARAM_XSIZE, %ebx
196
197	movl	%edi, SAVE_EDI
198	movl	PARAM_DST, %edi
199
200	movl	%ebp, SAVE_EBP
201	movl	PARAM_DIVISOR, %ebp
202
203	movl	%esi, SAVE_ESI
204	movl	PARAM_SRC, %esi
205
206	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
207	jmp	L(start_1c)
208
209EPILOGUE()
210
211
212	C offset 0xa1, close enough to aligned
213PROLOGUE(mpn_divrem_1)
214deflit(`FRAME',0)
215
216	movl	PARAM_SIZE, %ecx
217	movl	$0, %edx		C initial carry (if can't skip a div)
218	subl	$STACK_SPACE, %esp
219deflit(`FRAME',STACK_SPACE)
220
221	movl	%esi, SAVE_ESI
222	movl	PARAM_SRC, %esi
223
224	movl	%ebx, SAVE_EBX
225	movl	PARAM_XSIZE, %ebx
226
227	movl	%ebp, SAVE_EBP
228	movl	PARAM_DIVISOR, %ebp
229	orl	%ecx, %ecx		C size
230
231	movl	%edi, SAVE_EDI
232	movl	PARAM_DST, %edi
233	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
234
235	jz	L(no_skip_div)		C if size==0
236	movl	-4(%esi,%ecx,4), %eax	C src high limb
237	xorl	%esi, %esi
238
239	cmpl	%ebp, %eax		C high cmp divisor
240
241	cmovc(	%eax, %edx)		C high is carry if high<divisor
242	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
243
244	movl	%esi, (%edi,%ecx,4)	C dst high limb
245	sbbl	$0, %ecx		C size-1 if high<divisor
246	movl	PARAM_SRC, %esi		C reload
247L(no_skip_div):
248
249
250L(start_1c):
251	C eax
252	C ebx	xsize
253	C ecx	size
254	C edx	carry
255	C esi	src
256	C edi	&dst[xsize-1]
257	C ebp	divisor
258
259	leal	(%ebx,%ecx), %eax	C size+xsize
260	cmpl	$MUL_THRESHOLD, %eax
261	jae	L(mul_by_inverse)
262
263
264C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
265C It'd be possible to write them out without the looping, but no speedup
266C would be expected.
267C
268C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
269C integer part, but curiously not on the fractional part, where %ebp is a
270C (fixed) couple of cycles faster.
271
272	orl	%ecx, %ecx
273	jz	L(divide_no_integer)
274
275L(divide_integer):
276	C eax	scratch (quotient)
277	C ebx	xsize
278	C ecx	counter
279	C edx	scratch (remainder)
280	C esi	src
281	C edi	&dst[xsize-1]
282	C ebp	divisor
283
284	movl	-4(%esi,%ecx,4), %eax
285
286	divl	PARAM_DIVISOR
287
288	movl	%eax, (%edi,%ecx,4)
289	decl	%ecx
290	jnz	L(divide_integer)
291
292
293L(divide_no_integer):
294	movl	PARAM_DST, %edi
295	orl	%ebx, %ebx
296	jnz	L(divide_fraction)
297
298L(divide_done):
299	movl	SAVE_ESI, %esi
300	movl	SAVE_EDI, %edi
301	movl	%edx, %eax
302
303	movl	SAVE_EBX, %ebx
304	movl	SAVE_EBP, %ebp
305	addl	$STACK_SPACE, %esp
306
307	ret
308
309
310L(divide_fraction):
311	C eax	scratch (quotient)
312	C ebx	counter
313	C ecx
314	C edx	scratch (remainder)
315	C esi
316	C edi	dst
317	C ebp	divisor
318
319	movl	$0, %eax
320
321	divl	%ebp
322
323	movl	%eax, -4(%edi,%ebx,4)
324	decl	%ebx
325	jnz	L(divide_fraction)
326
327	jmp	L(divide_done)
328
329
330
331C -----------------------------------------------------------------------------
332
333L(mul_by_inverse):
334	C eax
335	C ebx	xsize
336	C ecx	size
337	C edx	carry
338	C esi	src
339	C edi	&dst[xsize-1]
340	C ebp	divisor
341
342	bsrl	%ebp, %eax		C 31-l
343
344	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
345	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
346
347	movl	%edi, VAR_DST
348	movl	%ebx, VAR_DST_STOP
349
350	movl	%ecx, %ebx		C size
351	movl	$31, %ecx
352
353	movl	%edx, %edi		C carry
354	movl	$-1, %edx
355
356	C
357
358	xorl	%eax, %ecx		C l
359	incl	%eax			C 32-l
360
361	shll	%cl, %ebp		C d normalized
362	movl	%ecx, VAR_NORM
363
364	movd	%eax, %mm7
365
366	movl	$-1, %eax
367	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
368
369	divl	%ebp			C floor (b*(b-d)-1) / d
370
371L(start_preinv):
372	C eax	inverse
373	C ebx	size
374	C ecx	shift
375	C edx
376	C esi	src
377	C edi	carry
378	C ebp	divisor
379	C
380	C mm7	rshift
381
382	orl	%ebx, %ebx		C size
383	movl	%eax, VAR_INVERSE
384	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
385
386	jz	L(start_zero)
387	movl	%eax, VAR_SRC
388	cmpl	$1, %ebx
389
390	movl	8(%eax), %esi		C src high limb
391	jz	L(start_one)
392
393L(start_two_or_more):
394	movl	4(%eax), %edx		C src second highest limb
395
396	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
397
398	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
399
400	cmpl	$2, %ebx
401	je	L(integer_two_left)
402	jmp	L(integer_top)
403
404
405L(start_one):
406	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
407
408	shll	%cl, %esi		C n10 = high << l
409	movl	%eax, VAR_SRC
410	jmp	L(integer_one_left)
411
412
413L(start_zero):
414	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
415	C skipped a division.
416
417	shll	%cl, %edi		C n2 = carry << l
418	movl	%edi, %eax		C return value for zero_done
419	cmpl	$0, PARAM_XSIZE
420
421	je	L(zero_done)
422	jmp	L(fraction_some)
423
424
425
426C -----------------------------------------------------------------------------
427C
428C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
429C execution.  The instruction scheduling is important, with various
430C apparently equivalent forms running 1 to 5 cycles slower.
431C
432C A lower bound for the time would seem to be 16 cycles, based on the
433C following successive dependencies.
434C
435C		      cycles
436C		n2+n1	1
437C		mul	6
438C		q1+1	1
439C		mul	6
440C		sub	1
441C		addback	1
442C		       ---
443C		       16
444C
445C This chain is what the loop has already, but 16 cycles isn't achieved.
446C K7 has enough decode, and probably enough execute (depending maybe on what
447C a mul actually consumes), but nothing running under 17 has been found.
448C
449C In theory n2+n1 could be done in the sub and addback stages (by
450C calculating both n2 and n2+n1 there), but lack of registers makes this an
451C unlikely proposition.
452C
453C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
454C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
455C chain, and nothing better than 18 cycles has been found when using it.
456C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
457C be an extremely rare event.
458C
459C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
460C if some special data is coming out with this always, the q1_ff special
461C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
462C induce the q1_ff case, for speed measurements or testing.  Note that
463C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
464C
465C The instruction groupings and empty comments show the cycles for a naive
466C in-order view of the code (conveniently ignoring the load latency on
467C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
468C to the extent that out-of-order execution rearranges it.  In this case
469C there's 19 cycles shown, but it executes at 17.
470
471	ALIGN(16)
472L(integer_top):
473	C eax	scratch
474	C ebx	scratch (nadj, q1)
475	C ecx	scratch (src, dst)
476	C edx	scratch
477	C esi	n10
478	C edi	n2
479	C ebp	divisor
480	C
481	C mm0	scratch (src qword)
482	C mm7	rshift for normalization
483
484	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
485	movl	%edi, %eax         C n2
486	movl	VAR_SRC, %ecx
487
488	leal	(%ebp,%esi), %ebx
489	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
490	sbbl	$-1, %eax          C n2+n1
491
492	mull	VAR_INVERSE        C m*(n2+n1)
493
494	movq	(%ecx), %mm0       C next limb and the one below it
495	subl	$4, %ecx
496
497	movl	%ecx, VAR_SRC
498
499	C
500
501	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
502	leal	1(%edi), %ebx      C n2+1
503	movl	%ebp, %eax	   C d
504
505	C
506
507	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
508	jz	L(q1_ff)
509	movl	VAR_DST, %ecx
510
511	mull	%ebx		   C (q1+1)*d
512
513	psrlq	%mm7, %mm0
514
515	leal	-4(%ecx), %ecx
516
517	C
518
519	subl	%eax, %esi
520	movl	VAR_DST_STOP, %eax
521
522	C
523
524	sbbl	%edx, %edi	   C n - (q1+1)*d
525	movl	%esi, %edi	   C remainder -> n2
526	leal	(%ebp,%esi), %edx
527
528	movd	%mm0, %esi
529
530	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
531	sbbl	$0, %ebx	   C q
532	cmpl	%eax, %ecx
533
534	movl	%ebx, (%ecx)
535	movl	%ecx, VAR_DST
536	jne	L(integer_top)
537
538
539L(integer_loop_done):
540
541
542C -----------------------------------------------------------------------------
543C
544C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
545C q1_ff special case.  This make the code a bit smaller and simpler, and
546C costs only 1 cycle (each).
547
548L(integer_two_left):
549	C eax	scratch
550	C ebx	scratch (nadj, q1)
551	C ecx	scratch (src, dst)
552	C edx	scratch
553	C esi	n10
554	C edi	n2
555	C ebp	divisor
556	C
557	C mm7	rshift
558
559	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
560	movl	%edi, %eax         C n2
561	movl	PARAM_SRC, %ecx
562
563	leal	(%ebp,%esi), %ebx
564	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
565	sbbl	$-1, %eax          C n2+n1
566
567	mull	VAR_INVERSE        C m*(n2+n1)
568
569	movd	(%ecx), %mm0	   C src low limb
570
571	movl	VAR_DST_STOP, %ecx
572
573	C
574
575	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
576	leal	1(%edi), %ebx      C n2+1
577	movl	%ebp, %eax	   C d
578
579	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
580
581	sbbl	$0, %ebx
582
583	mull	%ebx		   C (q1+1)*d
584
585	psllq	$32, %mm0
586
587	psrlq	%mm7, %mm0
588
589	C
590
591	subl	%eax, %esi
592
593	C
594
595	sbbl	%edx, %edi	   C n - (q1+1)*d
596	movl	%esi, %edi	   C remainder -> n2
597	leal	(%ebp,%esi), %edx
598
599	movd	%mm0, %esi
600
601	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
602	sbbl	$0, %ebx	   C q
603
604	movl	%ebx, -4(%ecx)
605
606
607C -----------------------------------------------------------------------------
608L(integer_one_left):
609	C eax	scratch
610	C ebx	scratch (nadj, q1)
611	C ecx	dst
612	C edx	scratch
613	C esi	n10
614	C edi	n2
615	C ebp	divisor
616	C
617	C mm7	rshift
618
619	movl	VAR_DST_STOP, %ecx
620	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
621	movl	%edi, %eax         C n2
622
623	leal	(%ebp,%esi), %ebx
624	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
625	sbbl	$-1, %eax          C n2+n1
626
627	mull	VAR_INVERSE        C m*(n2+n1)
628
629	C
630
631	C
632
633	C
634
635	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
636	leal	1(%edi), %ebx      C n2+1
637	movl	%ebp, %eax	   C d
638
639	C
640
641	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
642
643	sbbl	$0, %ebx           C q1 if q1+1 overflowed
644
645	mull	%ebx
646
647	C
648
649	C
650
651	C
652
653	subl	%eax, %esi
654
655	C
656
657	sbbl	%edx, %edi	   C n - (q1+1)*d
658	movl	%esi, %edi	   C remainder -> n2
659	leal	(%ebp,%esi), %edx
660
661	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
662	sbbl	$0, %ebx	   C q
663
664	movl	%ebx, -8(%ecx)
665	subl	$8, %ecx
666
667
668
669L(integer_none):
670	cmpl	$0, PARAM_XSIZE
671	jne	L(fraction_some)
672
673	movl	%edi, %eax
674L(fraction_done):
675	movl	VAR_NORM, %ecx
676L(zero_done):
677	movl	SAVE_EBP, %ebp
678
679	movl	SAVE_EDI, %edi
680	movl	SAVE_ESI, %esi
681
682	movl	SAVE_EBX, %ebx
683	addl	$STACK_SPACE, %esp
684
685	shrl	%cl, %eax
686	emms
687
688	ret
689
690
691C -----------------------------------------------------------------------------
692C
693C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
694C of q*d is simply -d and the remainder n-q*d = n10+d
695
696L(q1_ff):
697	C eax	(divisor)
698	C ebx	(q1+1 == 0)
699	C ecx
700	C edx
701	C esi	n10
702	C edi	n2
703	C ebp	divisor
704
705	movl	VAR_DST, %ecx
706	movl	VAR_DST_STOP, %edx
707	subl	$4, %ecx
708
709	psrlq	%mm7, %mm0
710	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
711	movl	%ecx, VAR_DST
712
713	movd	%mm0, %esi		C next n10
714
715	movl	$-1, (%ecx)
716	cmpl	%ecx, %edx
717	jne	L(integer_top)
718
719	jmp	L(integer_loop_done)
720
721
722
723C -----------------------------------------------------------------------------
724C
725C Being the fractional part, the "source" limbs are all zero, meaning
726C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
727C
728C The loop runs at 15 cycles.  The dependent chain is the same as the
729C general case above, but without the n2+n1 stage (due to n1==0), so 15
730C would seem to be the lower bound.
731C
732C A not entirely obvious simplification is that q1+1 never overflows a limb,
733C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
734C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
735C rnd() means rounding down to a multiple of d.
736C
737C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
738C		     = m*d + b*d - m - b
739C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
740C		     = rnd(b(b-d)-1) + b*d - m - b
741C		     = rnd(b(b-d)-1 + b*d) - m - b
742C		     = rnd(b*b-1) - m - b
743C		     <= (b-2)*b
744C
745C Unchanged from the general case is that the final quotient limb q can be
746C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
747C equation 8.4 of the paper which simplifies as follows when n1==0 and
748C n0==0.
749C
750C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
751C
752C As before, the instruction groupings and empty comments show a naive
753C in-order view of the code, which is made a nonsense by out of order
754C execution.  There's 17 cycles shown, but it executes at 15.
755C
756C Rotating the store q and remainder->n2 instructions up to the top of the
757C loop gets the run time down from 16 to 15.
758
759	ALIGN(16)
760L(fraction_some):
761	C eax
762	C ebx
763	C ecx
764	C edx
765	C esi
766	C edi	carry
767	C ebp	divisor
768
769	movl	PARAM_DST, %esi
770	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
771	movl	%edi, %eax
772
773	subl	$8, %ecx		C &dst[xsize]
774	jmp	L(fraction_entry)
775
776
777	ALIGN(16)
778L(fraction_top):
779	C eax	n2 carry, then scratch
780	C ebx	scratch (nadj, q1)
781	C ecx	dst, decrementing
782	C edx	scratch
783	C esi	dst stop point
784	C edi	(will be n2)
785	C ebp	divisor
786
787	movl	%ebx, (%ecx)	C previous q
788	movl	%eax, %edi	C remainder->n2
789
790L(fraction_entry):
791	mull	VAR_INVERSE	C m*n2
792
793	movl	%ebp, %eax	C d
794	subl	$4, %ecx	C dst
795	leal	1(%edi), %ebx
796
797	C
798
799	C
800
801	C
802
803	C
804
805	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1
806
807	mull	%ebx		C (q1+1)*d
808
809	C
810
811	C
812
813	C
814
815	negl	%eax		C low of n - (q1+1)*d
816
817	C
818
819	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
820	leal	(%ebp,%eax), %edx
821
822	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
823	sbbl	$0, %ebx	C q
824	cmpl	%esi, %ecx
825
826	jne	L(fraction_top)
827
828
829	movl	%ebx, (%ecx)
830	jmp	L(fraction_done)
831
832EPILOGUE()
833