xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/sqr_diag_addlsh1.asm (revision aef5eb5f59cdfe8314f1b5f78ac04eb144e44010)
1dnl  AMD64 mpn_sqr_diag_addlsh1
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 2.5
37C AMD K10	 2.5
38C AMD bull	 3.6
39C AMD pile	 3.6
40C AMD steam	 ?
41C AMD bobcat	 4
42C AMD jaguar	 ?
43C Intel P4	 11.5
44C Intel core	 4
45C Intel NHM	 3.6
46C Intel SBR	 3.15
47C Intel IBR	 3.0
48C Intel HWL	 2.6
49C Intel BWL	 ?
50C Intel atom	14
51C VIA nano	 3.5
52
53C When playing with pointers, set this to $2 to fall back to conservative
54C indexing in wind-down code.
C I(a,b) expands to its first argument, so the wind-down code uses the
C addressing form without the index register.
55define(`I',`$1')
56
C Incoming arguments, in the first four System V integer argument registers.
57define(`rp',     `%rdi')		C destination pointer
58define(`tp',     `%rsi')		C source of the limbs that get doubled
59define(`up_arg', `%rdx')		C source of the limbs that get squared
60define(`n',      `%rcx')		C limb count of up; reused as the loop index
61
C Working copy of the up pointer.  mul overwrites %rdx, where up_arg arrives.
62define(`up',     `%r11')
63
C NOTE(review): presumably declares the ABIs this file can be built for; the
C macro is defined outside this file.
64ABI_SUPPORT(DOS64)
65ABI_SUPPORT(STD64)
66
67ASM_START()
68	TEXT
69	ALIGN(32)		C NOTE(review): presumably 32-byte alignment of the entry point
C mpn_sqr_diag_addlsh1 (rp, tp, up_arg, n)
C
C With B = 2^64, stores 2n limbs at rp holding
C	sum over i<n of up[i]^2 * B^(2i)  +  2 * B * {tp, 2n-2}
C i.e. rp[0] is the low half of up[0]^2 and every later limb is a doubled tp
C limb plus the matching half of a square.  No visible instruction captures a
C carry out of the top limb.
C
C Register use:
C   %rax      up limb to square, then low half of the square
C   %rdx      high half of the square
C   %r8,%r9   pair of tp limbs, doubled in place
C   %r10      high half of the previous square plus the saved doubling carry
C   %rbx      carry out of the doubling chain (0 or 1)
C   n         negative limb index, stepped by 2 up to 0
C
C Carry handling: the carry out of the add/adc pair at L(top) is still live at
C L(mid) because only mov executes in between, so it enters the doubling chain
C as bit 0.  The carry out of the doubling chain is parked in %rbx, since mul
C and add clobber the flags, and joins the next high half through the lea.
C
C NOTE(review): the L(mid) block always runs at least once, so n = 1 would read
C up[1] and store to rp[-1]; this appears to require n >= 2.  Confirm against
C the callers.
70PROLOGUE(mpn_sqr_diag_addlsh1)
71	FUNC_ENTRY(4)		C NOTE(review): presumably ABI entry glue for 4 arguments; macro defined elsewhere
72	push	%rbx		C %rbx is callee-saved and is used below as the carry holder
73
74	dec	n
75	shl	n		C n = 2*(n-1), the number of tp limbs
76
77	mov	(up_arg), %rax	C up[0], fetched before mul overwrites %rdx
78
79	lea	(rp,n,8), rp	C rp, tp and up now point 2*(n-1), 2*(n-1) and n-1 limbs
80	lea	(tp,n,8), tp	C past their starts, so the negated n indexes from the
81	lea	(up_arg,n,4), up	C front; scale 4 because n counts two per up limb
82	neg	n
83
84	mul	%rax		C %rdx:%rax = up[0]^2
85	mov	%rax, (rp,n,8)	C original rp[0] = low half of up[0]^2
86
87	xor	R32(%rbx), R32(%rbx)	C no doubling carry yet; also clears CF for the first adc
88	jmp	L(mid)
89
90	ALIGN(16)
91L(top):	add	%r10, %r8	C doubled tp limb + high half of the previous square
92	adc	%rax, %r9	C doubled tp limb + low half of the current square
93	mov	%r8, -8(rp,n,8)	C n was already advanced by 2, hence the -8 and 0 offsets
94	mov	%r9, (rp,n,8)
95L(mid):	mov	8(up,n,4), %rax	C next up limb to square
96	mov	(tp,n,8), %r8
97	mov	8(tp,n,8), %r9
98	adc	%r8, %r8	C double the tp pair; carry-in is the carry out of the
99	adc	%r9, %r9	C previous add/adc pair (zero on the first pass)
100	lea	(%rdx,%rbx), %r10	C previous high half + previous doubling carry; lea keeps CF
101	setc	R8(%rbx)	C save this doubling carry before mul and add clobber CF
102	mul	%rax		C %rdx:%rax = square of the limb just loaded
103	add	$2, n
104	js	L(top)		C loop while the index is still negative
105
106L(end):	add	%r10, %r8	C wind-down: same add/adc and stores as L(top), with n = 0
107	adc	%rax, %r9
108	mov	%r8, I(-8(rp),-8(rp,n,8))
109	mov	%r9, I((rp),(rp,n,8))
110	adc	%rbx, %rdx	C top limb = last high half + last doubling carry + CF
111	mov	%rdx, I(8(rp),8(rp,n,8))
112
113	pop	%rbx
114	FUNC_EXIT()
115	ret
116EPILOGUE()
117