dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/atom/addmul_2.asm (revision 901e7e84758515fbf39dfc064cb0b45ab146d8b0)
dnl  AMD64 mpn_addmul_2 optimised for Intel Atom.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb	best
C AMD K8,K9
C AMD K10
C AMD bd1
C AMD bd2
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel PNR
C Intel NHM
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom	18.8		this
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C Parameter registers, SysV AMD64 (STD64) ABI.  The trailing comments
C name the corresponding Win64 (DOS64) argument registers that
C FUNC_ENTRY(4) shuffles from when built for that ABI.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vp',      `%rcx')   C r9

C Scratch roles:
C   v0, v1 -- the two multiplier limbs, loaded from vp[0] and vp[1]
C   w0-w3  -- rotating window of partial-product accumulators; at any
C             point three of them hold pending sums, one is being zeroed
C   n      -- working copy of the count (n_param aliases %rdx, which
C             every mul clobbers)
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)

C mp_limb_t mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C
C Add {up,n} * {vp,2} into {rp,n+1} and return the most significant limb
C of the sum in %rax (the standard GMP mpn_addmul_2 contract -- confirm
C against gmp-impl.h).
C
C The main loop is unrolled 4x.  The entry code dispatches on n mod 4 and
C pre-biases up/rp so that all four entry points share the loop's fixed
C displacements (0/8/16/24).  Carries are threaded through the flags
C between adc instructions, which is why the accumulators are cleared
C with `mov $0' (flag-preserving) inside the loop rather than xor.
C
C NOTE(review): the n mod 4 == 1 path falls through a full unrolled pass
C before the count check, so n == 1 appears unsupported here -- callers
C presumably guarantee a sufficient minimum n; verify before reuse.
PROLOGUE(mpn_addmul_2)
	FUNC_ENTRY(4)
	push	%rbx			C callee-saved, used as w0
	push	%rbp			C callee-saved, used as w2

	mov	(up), %rax		C rax = up[0], first mul operand

	mov	(vp), v0		C v0 = vp[0]
	mov	8(vp), v1		C v1 = vp[1]

	mov	n_param, n		C copy count; mul clobbers %rdx
	mul	v0			C rdx:rax = up[0] * v0

	test	$1, R8(n)
	jnz	L(bx1)

C n mod 4 == 0 or 2
L(bx0):	test	$2, R8(n)
	jnz	L(b10)

C n mod 4 == 0: enter at lo0, rp biased down one limb
L(b00):	mov	%rax, w0
	mov	(up), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	lea	-8(rp), rp
	jmp	L(lo0)

C n mod 4 == 2: enter at lo2, up/rp biased so its 24(...) offsets hit limb 0
L(b10):	mov	%rax, w2
	mov	(up), %rax
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	lea	-16(up), up
	lea	-24(rp), rp
	jmp	L(lo2)

C n mod 4 == 1 or 3
L(bx1):	test	$2, R8(n)
	jnz	L(b11)

C n mod 4 == 1: enter at lo1; pre-decrement n so `sub $4' counts out right
L(b01):	mov	%rax, w3
	mov	%rdx, w0
	mov	(up), %rax
	xor	R32(w1), R32(w1)
	lea	8(up), up
	dec	n
	jmp	L(lo1)

C n mod 4 == 3: enter at lo3
L(b11):	mov	%rax, w1
	mov	(up), %rax
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	lea	-8(up), up
	lea	-16(rp), rp
	jmp	L(lo3)

	ALIGN(16)
C Main loop, 4 limbs per iteration.  Each lo<i> section adds the finished
C low limb into rp[], then folds u*v1 and the next u*v0 into the w window.
L(top):
L(lo1):	mul	v1
	add	w3, (rp)		C commit finished limb to rp[]
	mov	$0, R32(w2)		C clear next accumulator; keeps CF
	adc	%rax, w0
	mov	(up), %rax
	adc	%rdx, w1
	mul	v0
	add	%rax, w0
	mov	(up), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)		C collect carry into fresh accumulator
L(lo0):	mul	v1
	add	w0, 8(rp)
	adc	%rax, w1
	mov	8(up), %rax
	mov	$0, R32(w3)
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	8(up), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(lo3):	mul	v1
	add	w1, 16(rp)
	adc	%rax, w2
	mov	16(up), %rax
	mov	$0, R32(w0)
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
L(lo2):	mul	v1
	add	w2, 24(rp)
	adc	%rax, w3
	mov	24(up), %rax
	adc	%rdx, w0
	mov	$0, R32(w1)
	lea	32(rp), rp		C advance one unrolled chunk
	mul	v0
	lea	32(up), up
	add	%rax, w3
	adc	%rdx, w0
	mov	-8(up), %rax		C reload top limb for the mul at L(top)
	adc	$0, R32(w1)
	sub	$4, n
	ja	L(top)			C unsigned: falls through once n <= 4

C Wind-down: last u*v1 product, then store the two high limbs.
L(end):	mul	v1
	add	w3, (rp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax		C return most significant limb
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()