xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/lshsub_n.asm (revision 230b95665bbd3a9d1a53658a36b1053f8382a519)
1dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
2
3dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
25C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
26C Intel P4	16.5
27C Intel core2	 4.35
28C Intel corei	 ?
29C Intel atom	 ?
30C VIA nano	 ?
31
32C This was written quickly and not optimized at all, but it runs very well on
33C K8.  But perhaps one could get under 3 c/l.  Ideas:
34C   1) Use indexing to save the 3 LEA
35C   2) Write reasonable feed-in code
36C   3) Be more clever about register usage
37C   4) Unroll more; the CL negation and the carry save/restore currently cost a lot
38C   5) Reschedule
39
40C INPUT PARAMETERS
41define(`rp',	`%rdi')
42define(`up',	`%rsi')
43define(`vp',	`%rdx')
44define(`n',	`%rcx')
45define(`cnt',	`%r8')
46
47ABI_SUPPORT(DOS64)
48ABI_SUPPORT(STD64)
49
50ASM_START()
51	TEXT
52	ALIGN(16)
53PROLOGUE(mpn_lshsub_n)
54	FUNC_ENTRY(4)
55IFDOS(`	mov	56(%rsp), %r8d	')
56
57	push	%r12
58	push	%r13
59	push	%r14
60	push	%r15
61	push	%rbx
62
63	mov	n, %rax
64	xor	R32(%rbx), R32(%rbx)	C clear carry save register
65	mov	R32(%r8), R32(%rcx)	C shift count
66	xor	R32(%r15), R32(%r15)	C limb carry
67
68	mov	R32(%rax), R32(%r11)
69	and	$3, R32(%r11)
70	je	L(4)
71	sub	$1, R32(%r11)
72
73L(oopette):
74	add	R32(%rbx), R32(%rbx)	C restore carry flag
75	mov	0(up), %r8
76	lea	8(up), up
77	sbb	0(vp), %r8
78	mov	%r8, %r12
79	sbb	R32(%rbx), R32(%rbx)	C save carry flag
80	shl	R8(%rcx), %r8
81	or	%r15, %r8
82	mov	%r12, %r15
83	lea	8(vp), vp
84	neg	R8(%rcx)
85	shr	R8(%rcx), %r15
86	neg	R8(%rcx)
87	mov	%r8, 0(rp)
88	lea	8(rp), rp
89	sub	$1, R32(%r11)
90	jnc	L(oopette)
91
92L(4):
93	sub	$4, %rax
94	jc	L(end)
95
96	ALIGN(16)
97L(oop):
98	add	R32(%rbx), R32(%rbx)	C restore carry flag
99
100	mov	0(up), %r8
101	mov	8(up), %r9
102	mov	16(up), %r10
103	mov	24(up), %r11
104
105	lea	32(up), up
106
107	sbb	0(vp), %r8
108	mov	%r8, %r12
109	sbb	8(vp), %r9
110	mov	%r9, %r13
111	sbb	16(vp), %r10
112	mov	%r10, %r14
113	sbb	24(vp), %r11
114
115	sbb	R32(%rbx), R32(%rbx)	C save carry flag
116
117	shl	R8(%rcx), %r8
118	shl	R8(%rcx), %r9
119	shl	R8(%rcx), %r10
120	or	%r15, %r8
121	mov	%r11, %r15
122	shl	R8(%rcx), %r11
123
124	lea	32(vp), vp
125
126	neg	R8(%rcx)
127
128	shr	R8(%rcx), %r12
129	shr	R8(%rcx), %r13
130	shr	R8(%rcx), %r14
131	shr	R8(%rcx), %r15		C used next loop
132
133	or	%r12, %r9
134	or	%r13, %r10
135	or	%r14, %r11
136
137	neg	R8(%rcx)
138
139	mov	%r8, 0(rp)
140	mov	%r9, 8(rp)
141	mov	%r10, 16(rp)
142	mov	%r11, 24(rp)
143
144	lea	32(rp), rp
145
146	sub	$4, %rax
147	jnc	L(oop)
148L(end):
149	neg	R32(%rbx)
150	shl	R8(%rcx), %rbx
151	adc	%r15, %rbx
152	mov	%rbx, %rax
153	pop	%rbx
154	pop	%r15
155	pop	%r14
156	pop	%r13
157	pop	%r12
158
159	FUNC_EXIT()
160	ret
161EPILOGUE()
162