dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
C The inner loop is 2*3-way unrolled, which is the best we can do with the
C available registers.  It seems tricky to use the same structure for
C rsblsh1_n, since we cannot feed carry between operations there.

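C For reference, the operation in rough C (illustrative sketch only, in
C the style of GMP's tests/refmpn.c; the 32-bit limb width of this file
C is assumed):
C
C	mp_limb_t
C	addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C	{
C	  mp_limb_t shift_cy = 0, add_cy = 0, v2, t;
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      v2 = (vp[i] << 1) | shift_cy;	/* doubled limb */
C	      shift_cy = vp[i] >> 31;		/* bit shifted out */
C	      t = up[i] + v2;			/* add doubled limb */
C	      rp[i] = t + add_cy;
C	      add_cy = (t < v2) + (rp[i] < t);	/* carry out of add */
C	    }
C	  return add_cy + shift_cy;		/* 0, 1 or 2 */
C	}
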
C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6
C AMD K6			 ?
C AMD K7			 2.5
C AMD K8

C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*3-way unrolling, the most the available registers
C allow.  Unfortunately, that means we need an initial magic multiply to
C compute the loop count.
C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
C without losing any time, since we are not issue limited but limited by the
C carry recurrence latency.
C
C Breaking the carry recurrence might be a good idea.  We would then need
C separate registers for the shift carry and the add/subtract carry, which in
C turn would force us to 2*2-way unrolling.

defframe(PARAM_SIZE,	16)
defframe(PARAM_DBLD,	12)
defframe(PARAM_SRC,	 8)
defframe(PARAM_DST,	 4)

dnl  re-use parameter space
define(VAR_COUNT,`PARAM_DST')
define(VAR_TMP,`PARAM_DBLD')
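dnl  This re-use is safe since each parameter is copied into a register
dnl  before its stack slot is overwritten: PARAM_DST is loaded into rp
dnl  before VAR_COUNT is stored, and PARAM_DBLD into vp before VAR_TMP.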

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

define(`rp',  `%edi')
define(`up',  `%esi')
define(`vp',  `%ebp')

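C The loop count is derived with a multiply-by-inverse instead of a div
C insn: 0x2aaaaaab = (2^32+2)/6, so the high product half that mul leaves
C in %edx below is exactly size\6 for any size < 2^31.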
	mov	$0x2aaaaaab, %eax

	push	%ebx			FRAME_pushl()
	mov	PARAM_SIZE, %ebx	C size

	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp

	mul	%ebx

	push	up			FRAME_pushl()
	mov	PARAM_SRC, up

	not	%edx			C count = -(size\6)-1
	mov	%edx, VAR_COUNT

	push	vp			FRAME_pushl()
	mov	PARAM_DBLD, vp

	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
	xor	%edx, %edx
	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
	or	%ebx, %ebx
	jz	L(exact)

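C The feed-in loop handles the size % 6 leftover limbs, one per iteration.
C Two carry chains must survive each iteration: the lshift carry and the
C addition carry.  CF holds one while %edx shuttles the other: rcr moves
C the saved add carry from bit 0 of %edx into CF and parks the shift carry
C in bit 31; the closing adc %edx,%edx brings the shift carry back into CF
C while saving the new add carry in bit 0.  On P6, apparently to sidestep
C the partial-flags stall incurred when CF is read right after inc/dec,
C both bits are instead kept in %edx (the extra shr/adc).
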
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax
	rcr	%edx			C restore 1st saved carry bit
	lea	4(vp), vp
	adc	(up), %eax
	lea	4(up), up
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx
	mov	%eax, (rp)
	lea	4(rp), rp
	jnz	L(oop)
	mov	vp, VAR_TMP
L(exact):
	incl	VAR_COUNT
	jz	L(end)

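C Main loop: six limbs per iteration, as two 3-limb blocks.  Each block
C doubles three v limbs with one adc chain, then adds them into three u
C limbs with another.  The carry out of an add block seeds the next shift
C block, and the shift block's carry (parked in %edx by rcr) seeds the
C next add block; since both carries are weight-1 bits entering additions,
C they can swap roles freely.  This crossing is what rules out reusing the
C structure for subtraction, as noted above.
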
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax
	mov	4(vp), %ebx
	adc	%ebx, %ebx
	mov	8(vp), %ecx
	adc	%ecx, %ecx

	rcr	%edx			C restore 1st saved carry bit

	adc	(up), %eax
	mov	%eax, (rp)
	adc	4(up), %ebx
	mov	%ebx, 4(rp)
	adc	8(up), %ecx
	mov	%ecx, 8(rp)

	mov	12(vp), %eax
	adc	%eax, %eax
	mov	16(vp), %ebx
	adc	%ebx, %ebx
	mov	20(vp), %ecx
	adc	%ecx, %ecx

	lea	24(vp), vp
	adc	%edx, %edx		C save a carry bit in edx

	adc	12(up), %eax
	mov	%eax, 12(rp)
	adc	16(up), %ebx
	mov	%ebx, 16(rp)
	adc	20(up), %ecx

	lea	24(up), up

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	mov	%ecx, 20(rp)
	incl	VAR_COUNT
	lea	24(rp), rp
	jne	L(top)

L(end):
	pop	vp			FRAME_popl()
	pop	up			FRAME_popl()

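C Fold the two saved carry bits (shift carry and add carry, each of
C weight 1) into %eax to form the return value, 0, 1 or 2.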
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx
	adc	%edx, %eax
',`
	adc	$0, %edx
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	%ebx			FRAME_popl()
	ret
EPILOGUE()
ASM_END()