xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/aorsmul_1.asm (revision 154bfe8e089c1a0a4e9ed8414f08d3da90949162)
1dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C			    cycles/limb
35C P5
36C P6 model 0-8,10-12
37C P6 model 9  (Banias)		 6.5
38C P6 model 13 (Dothan)
39C P4 model 0  (Willamette)
40C P4 model 1  (?)
41C P4 model 2  (Northwood)
42C P4 model 3  (Prescott)
43C P4 model 4  (Nocona)
44C AMD K6
45C AMD K7			 3.75
46C AMD K8
47
48C TODO
49C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
50C    but lose by 2x for n == 1.
51
C Select the variant being built.  ADDSUB becomes the instruction applied to
C each destination limb (add or sub) and func becomes the entry point name.
C There is no else branch: if neither OPERATION_ symbol is defined, ADDSUB and
C func are left undefined by this file.  The OPERATION_ symbols are not
C defined in this file; presumably the build defines exactly one of them
C before this file is processed (to be confirmed against the build rules).
52ifdef(`OPERATION_addmul_1',`
53      define(`ADDSUB',        `add')
54      define(`func',  `mpn_addmul_1')
55')
56ifdef(`OPERATION_submul_1',`
57      define(`ADDSUB',        `sub')
58      define(`func',  `mpn_submul_1')
59')
60
C MULFUNC_PROLOGUE is defined outside this file; presumably it announces the
C list of entry points this one source can provide (to be confirmed).
61MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
62
63ASM_START()
64	TEXT
65	ALIGN(16)
C func (mpn_addmul_1 or mpn_submul_1, depending on how ADDSUB is defined).
C
C Multiplies the n-limb vector at src by the single 32-bit limb v and adds the
C product to (ADDSUB = add) or subtracts it from (ADDSUB = sub) the n-limb
C vector at dst, in place.  The limb carried or borrowed out of the top is
C returned in eax.
C
C Stack arguments, at the offsets valid once the 16-byte register save area
C has been allocated (the names dst, src, n, v are local to these comments;
C presumably they correspond to the rp, up, n, v of the GMP prototype, to be
C confirmed against gmp.h):
C	20(%esp)  dst	loaded into edi
C	24(%esp)  src	loaded into esi
C	28(%esp)  n	slot is overwritten with n/4 and reused as loop counter
C	32(%esp)  v	loaded into ecx
C
C Register use:
C	eax, edx   operand and double-word result of each mul
C	ebx, ebp   swap roles every limb: one holds the limb about to be
C		   applied to dst, the other collects the high word and carry
C		   that feed the following limb
C	ebp, ebx, esi, edi are saved on entry and restored before ret
C
C The loop handles four limbs per pass and has four entry points, L(0) to
C L(3).  The feed-in code selects one from n mod 4 after computing the first
C product, and biases esi and edi so that the fixed offsets inside the loop
C land on limb 0.  Every path ends in the two-limb wind-down at L(cj2), except
C n == 1 which goes straight to L(cj1).
C
C n == 0 is not handled: the first source limb is read unconditionally and
C the b0 path would decrement a zero loop counter.  Callers are presumably
C required to pass n >= 1.
66PROLOGUE(func)
67	add	$-16, %esp		C room for the four saved registers
68	mov	%ebp, (%esp)
69	mov	%ebx, 4(%esp)
70	mov	%esi, 8(%esp)
71	mov	%edi, 12(%esp)
72
73	mov	20(%esp), %edi		C edi = dst
74	mov	24(%esp), %esi		C esi = src
75	mov	28(%esp), %eax		C eax = n
76	mov	32(%esp), %ecx		C ecx = v
77	mov	%eax, %ebx
78	shr	$2, %eax
79	mov	%eax, 28(%esp)		C loop counter = n / 4, kept in the n slot
80	mov	(%esi), %eax		C first source limb, for the mul in each b path
81	and	$3, %ebx		C ebx = n mod 4
82	jz	L(b0)
83	cmp	$2, %ebx
84	jz	L(b2)
85	jg	L(b3)			C n mod 4 == 3, otherwise fall into b1
86
	C n mod 4 == 1.  Bias both pointers by one limb so the offsets used
	C from L(1) onward address limb 0 of dst and limb 2 of src.
86L(b1):	lea	-4(%esi), %esi
87	lea	-4(%edi), %edi
88	mul	%ecx
89	mov	%eax, %ebx		C ebx = low word, applied at L(1) or L(cj1)
90	mov	%edx, %ebp		C ebp = high word
91	cmpl	$0, 28(%esp)
92	jz	L(cj1)			C n == 1: edx still holds the high word
93	mov	8(%esi), %eax		C second source limb
94	jmp	L(1)
95
	C n mod 4 == 2.  No pointer bias is needed for entry at L(2).
96L(b2):	mul	%ecx
97	mov	%eax, %ebp		C ebp = low word, applied at L(2) or L(cj2)
98	mov	4(%esi), %eax		C second source limb
99	mov	%edx, %ebx		C ebx = high word
100	cmpl	$0, 28(%esp)
101	jne	L(2)
102	jmp	L(cj2)			C n == 2: only the wind-down is needed
103
	C n mod 4 == 3.  Bias both pointers by three limbs for entry at L(3).
104L(b3):	lea	-12(%esi), %esi
105	lea	-12(%edi), %edi
106	mul	%ecx
107	mov	%eax, %ebx		C ebx = low word, applied at L(3)
108	mov	%edx, %ebp		C ebp = high word
109	mov	16(%esi), %eax		C second source limb
110	incl	28(%esp)		C the partial pass from L(3) still ends in decl
111	jmp	L(3)
112
	C n mod 4 == 0.  Bias both pointers by two limbs for entry at L(0).
113L(b0):	lea	-8(%esi), %esi
114	lea	-8(%edi), %edi
115	mul	%ecx
116	mov	%eax, %ebp		C ebp = low word, applied at L(0)
117	mov	12(%esi), %eax		C second source limb
118	mov	%edx, %ebx		C ebx = high word
119	jmp	L(0)
120
121	ALIGN(16)
	C Main loop, four limbs per pass.  Each step issues the mul for the
	C next limb, applies the pending limb to dst with ADDSUB, then uses two
	C adc instructions to fold the resulting carry flag and the new product
	C into the next pending limb and the high word after it.  With
	C ADDSUB = sub the flag is a borrow; adding it to the value that is
	C subtracted next has the matching effect.  Registers are cleared with
	C mov $0, which leaves the carry flag intact for the adc that follows.
122L(top):	lea	16(%edi), %edi		C advance dst by four limbs
123L(2):	mul	%ecx			C edx:eax = limb * v
124	ADDSUB	%ebp, 0(%edi)
125	mov	$0, %ebp		C clear without disturbing CF
126	adc	%eax, %ebx		C ebx = previous high + new low + CF
127	mov	8(%esi), %eax		C next source limb
128	adc	%edx, %ebp		C ebp = new high + CF
129L(1):	mul	%ecx
130	ADDSUB	%ebx, 4(%edi)
131	mov	$0, %ebx
132	adc	%eax, %ebp
133	mov	12(%esi), %eax
134	adc	%edx, %ebx
135L(0):	mul	%ecx
136	ADDSUB	%ebp, 8(%edi)
137	mov	$0, %ebp
138	adc	%eax, %ebx
139	adc	%edx, %ebp
140	mov	16(%esi), %eax
141L(3):	mul	%ecx
142	ADDSUB	%ebx, 12(%edi)
143	adc	%eax, %ebp
144	mov	20(%esi), %eax		C limb for the next pass or for L(cj2)
145	lea	16(%esi), %esi		C advance src; lea keeps CF for the adc below
146	mov	$0, %ebx
147	adc	%edx, %ebx
148	decl	28(%esp)
149	jnz	L(top)
150
	C Wind-down: two limbs remain.  eax already holds the last source limb,
	C one of ebx and ebp the pending limb and the other the high word.
	C L(end) is not the target of any jump in this file; it is reached by
	C falling out of the loop.
151L(end):	lea	16(%edi), %edi		C advance dst as L(top) would
152L(cj2):	mul	%ecx			C product of the last source limb
153	ADDSUB	%ebp, (%edi)		C apply the second-to-last limb
154	adc	%eax, %ebx		C ebx = last limb to apply
155	adc	$0, %edx		C edx = last high word + CF
156L(cj1):	ADDSUB	%ebx, 4(%edi)		C apply the last limb
157	adc	$0, %edx		C edx = limb carried or borrowed out
158	mov	%edx, %eax		C return value
159	mov	(%esp), %ebp		C restore the registers saved on entry
160	mov	4(%esp), %ebx
161	mov	8(%esp), %esi
162	mov	12(%esp), %edi
163	add	$16, %esp
164	ret
165EPILOGUE()
166ASM_END()
168