xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/sublsh1_n.asm (revision 501cd18a74d52bfcca7d9e7e3b0d472bbc870558)
1dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
25C inner loop is 2*4-way unrolled, which is the best we can do with the available
26C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
27C cannot feed carry between operations there.
28
29C			    cycles/limb
30C P5
31C P6 model 0-8,10-12
32C P6 model 9  (Banias)
33C P6 model 13 (Dothan)
34C P4 model 0  (Willamette)
35C P4 model 1  (?)
36C P4 model 2  (Northwood)
37C P4 model 3  (Prescott)
38C P4 model 4  (Nocona)
39C Intel Atom			 6.75
40C AMD K6
41C AMD K7
42C AMD K8
43
44C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
45C processors.  It uses 2*4-way unrolling, for good reasons.
46C
47C Breaking the carry recurrence might be a good idea.  We would then need
48C separate registers for the shift carry and add/subtract carry, which in turn
49C would force us to 2*2-way unrolling.
50
C Argument slots for the destination pointer, source pointer and limb count,
C declared with defframe at offsets 4, 8 and 12.
51defframe(PARAM_SIZE,	12)
52defframe(PARAM_SRC,	 8)
53defframe(PARAM_DST,	 4)
54
55dnl  Re-use the parameter space as scratch storage.  The function body reads
dnl  each parameter into a register before it stores to the aliasing slot:
dnl  VAR_COUNT is the loop counter, SAVE_EBX and SAVE_EBP hold the saved
dnl  values of ebx and ebp.
56define(VAR_COUNT,`PARAM_SIZE')
57define(SAVE_EBX,`PARAM_SRC')
58define(SAVE_EBP,`PARAM_DST')
59
C mpn_sublsh1_n_ip1 -- subtract twice up[] from rp[] in place.
C
C Inputs are read from PARAM_DST (kept in edi as rp), PARAM_SRC (kept in esi
C as up) and PARAM_SIZE (limb count).  On return eax holds the sum of the two
C carry bits still outstanding after the last limb: the bit shifted out of
C up[] and the subtraction borrow.
C
C Register roles:
C   edi, esi		rp and up, advanced with lea so the flags are untouched
C   edx			holds saved carry bit(s) while the carry flag is busy
C   eax, ebx, ecx, ebp	doubled source limbs inside L(top)
C   ebx			first serves as the size mod 8 countdown in L(oop)
C
C Each limb is doubled with adc x,x and then subtracted from memory with sbb,
C so two carries are live at once, one in the carry flag and one in edx.
C rcr and adc on edx exchange which of the two sits in the carry flag.
C Inside L(top), from the second group of four limbs onward, the doubling
C takes the preceding borrow as its carry-in and the sbb chain takes the
C preceding shifted-out bit as its borrow-in.  Both bits have the same weight,
C so the result is the same as with the roles the other way round.
C
C When CPU_P6 is defined both bits are packed into edx at the bottom of each
C loop and one of them is unpacked again with shr at the top.
C NOTE(review): presumably this avoids reading the carry flag after dec/incl
C on P6 -- confirm against the P6 tuning notes.
60ASM_START()
61	TEXT
62	ALIGN(8)
63PROLOGUE(mpn_sublsh1_n_ip1)
64deflit(`FRAME',0)
65
66define(`rp',  `%edi')
67define(`up',  `%esi')
68
69	mov	PARAM_SIZE, %eax	C size
70	push	up			FRAME_pushl()
71	push	rp			FRAME_pushl()
72	xor	%edx, %edx		C no carry saved yet
73	mov	PARAM_SRC, up
74	mov	PARAM_DST, rp
75	mov	%ebx, SAVE_EBX		C PARAM_SRC already loaded, slot is free
76	mov	%eax, %ebx
77	shr	$3, %eax		C number of full 8-limb blocks
78
79	not	%eax			C eax = -(size\8)-1
80	and	$7, %ebx		C size % 8, and clears the carry flag
81	jz	L(exact)
82
C One limb per iteration for the size % 8 leftover limbs.
83L(oop):
84ifdef(`CPU_P6',`
85	shr	%edx ')			C restore 2nd saved carry bit
86	mov	(up), %ecx
87	adc	%ecx, %ecx		C double the limb, carry flag shifts in
88	rcr	%edx			C restore 1st saved carry bit
89	lea	4(up), up
90	sbb	%ecx, (rp)
91	lea	4(rp), rp
92	adc	%edx, %edx		C save a carry bit in edx
93ifdef(`CPU_P6',`
94	adc	%edx, %edx ')		C save another carry bit in edx
95	dec	%ebx			C dec leaves the carry flag alone
96	jnz	L(oop)
97L(exact):
98	inc	%eax			C eax = -(size\8), zero if no full block
99	jz	L(end)
100	mov	%eax, VAR_COUNT		C negative count, stepped up to zero
101	mov	%ebp, SAVE_EBP		C ebp becomes the fourth data register
102
C Eight limbs per iteration, as two groups of four.
103	ALIGN(16)
104L(top):
105ifdef(`CPU_P6',`
106	shr	%edx ')			C restore 2nd saved carry bit
C First group: double limbs 0..3 as one adc chain.
107	mov	(up), %eax
108	adc	%eax, %eax
109	mov	4(up), %ebx
110	adc	%ebx, %ebx
111	mov	8(up), %ecx
112	adc	%ecx, %ecx
113	mov	12(up), %ebp
114	adc	%ebp, %ebp
115
116	rcr	%edx			C restore 1st saved carry bit
C The bit shifted out above is now parked in bit 31 of edx.
117
118	sbb	%eax, (rp)
119	sbb	%ebx, 4(rp)
120	sbb	%ecx, 8(rp)
121	sbb	%ebp, 12(rp)
122
C Second group: double limbs 4..7, the borrow just produced shifts in.
123	mov	16(up), %eax
124	adc	%eax, %eax
125	mov	20(up), %ebx
126	adc	%ebx, %ebx
127	mov	24(up), %ecx
128	adc	%ecx, %ecx
129	mov	28(up), %ebp
130	adc	%ebp, %ebp
131
132	lea	32(up), up
133	adc	%edx, %edx		C save a carry bit in edx
C The same adc moves the bit parked in bit 31 back into the carry flag.
134
135	sbb	%eax, 16(rp)
136	sbb	%ebx, 20(rp)
137	sbb	%ecx, 24(rp)
138	sbb	%ebp, 28(rp)
139
140ifdef(`CPU_P6',`
141	adc	%edx, %edx ')		C save another carry bit in edx
142	incl	VAR_COUNT		C sets ZF at zero, carry flag untouched
143	lea	32(rp), rp		C lea keeps the flags from incl
144	jne	L(top)
145
146	mov	SAVE_EBP, %ebp
147L(end):
148	mov	SAVE_EBX, %ebx
149
C Return value: add the carry kept in edx to the one in the carry flag.  In
C the CPU_P6 case both are in edx, and the shr brings one back out first.
150ifdef(`CPU_P6',`
151	xor	%eax, %eax
152	shr	$1, %edx
153	adc	%edx, %eax
154',`
155	adc	$0, %edx
156	mov	%edx, %eax
157')
158	pop	rp			FRAME_popl()
159	pop	up			FRAME_popl()
160	ret
161EPILOGUE()
162ASM_END()
163