xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/p6/lshsub_n.asm (revision a5847cc334d9a7029f6352b847e9e8d71a0f9e0c)
1dnl  Intel P6 mpn_lshsub_n -- mpn papillion support.
2
3dnl  Copyright 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
23
C (1) The loop is not scheduled in any way, and scheduling attempts have not
C     improved speed on P6/13.  Presumably, the K7 will want scheduling, if it
C     wants to use MMX at all.
27C (2) We could save a register by not alternatingly using eax and edx in the
28C     loop.
29
C Symbolic register names, substituted by m4 throughout the code below.
C   cnt  MMX register holding the psrlq shift amount (32 minus the shift count)
C   n    limb count on entry, then the negative loop index
C   rp   result pointer
C   up   first source pointer (the operand subtracted from)
C   vp   second source pointer (the operand being subtracted)
define(`cnt',	`%mm7')
define(`n',	`%ecx')
define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
35
ASM_START()

C mpn_lshsub_n -- subtract two limb vectors and shift the difference left.
C
C Stack arguments, as seen after the three register pushes below:
C   16(%esp)  rp   result pointer
C   20(%esp)  up   first source pointer
C   24(%esp)  vp   second source pointer
C   28(%esp)  n    number of 32-bit limbs
C   32(%esp)       left shift count in bits
C
C Stores the low n limbs of ({up,n} - {vp,n}) << count at rp.  The value
C left in eax is (borrow << count) | (top difference limb >> (32 - count)),
C where borrow is the final borrow of the subtraction.
C
C Assumptions (not checked here, confirm against callers): n >= 1, since
C n = 0 would enter the loop and run a full 8-limb pass, and presumably
C 1 <= count <= 31.
C
C The loop is unrolled 8 ways and entered through a computed jump, so every
C unrolled limb block MUST assemble to exactly 22 bytes.  To guarantee this
C the pointers are biased by one limb, which makes every displacement in the
C loop non-zero (4..32) and therefore always encoded as a one-byte
C displacement.  A literal 0 displacement may be dropped by the assembler,
C which would shrink the first block to 19 bytes and misdirect the jump.

	TEXT
	ALIGN(16)

PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt			C cnt = 32 - shift count

	C Point each operand one limb below its end (see the size note above).
	lea	-4(up,n,4), up
	lea	-4(vp,n,4), vp
	lea	-4(rp,n,4), rp

	C Split -n into a multiple of 8 (loop index) and x = (-n) mod 8, the
	C number of leading limb blocks to skip on the first pass.
	neg	n
	mov	n, %eax
	and	$-8, n
	and	$7, %eax
	shl	%eax				C eax = 2x, and CF = 0
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = L(ent) + 22x
')

	C No previous limb yet, so both shift-in registers start at zero.
	C pxor and jmp leave the carry flag untouched.
	pxor	%mm1, %mm1
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C (%esp) holds the return address, which is L(here).  The final add
	C leaves CF clear provided the address sum does not wrap.
	lea	(%eax,%edx,2), %eax		C eax = 22x
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax			C eax = L(ent) + 22x
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	C CF still holds the borrow out of the last sbb, and edx holds the
	C last (most significant) difference limb.
	sbb	%eax, %eax
	neg	%eax				C eax = borrow, 0 or 1
	mov	32(%esp), %ecx			C reload the shift count
	shld	%cl, %edx, %eax

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	C Main loop.  Each limb block below is exactly 22 bytes.  Nothing
	C between one sbb and the next alters CF, so the borrow chains through
	C the whole loop, including the lea and the jumps.  The blocks alternate
	C between eax/mm0 and edx/mm1 so that the previous difference limb is
	C still available as the low half of the 64-bit value being shifted.
	ALIGN(16)
L(top):	jecxz	L(end)				C done when the index reaches zero
L(ent):	mov	   4(up,n,4), %eax
	sbb	   4(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   cnt, %mm1
	movd	   %mm1, 4(rp,n,4)

	mov	   8(up,n,4), %edx
	sbb	   8(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   cnt, %mm0
	movd	   %mm0, 8(rp,n,4)

	mov	   12(up,n,4), %eax
	sbb	   12(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   cnt, %mm1
	movd	   %mm1, 12(rp,n,4)

	mov	   16(up,n,4), %edx
	sbb	   16(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   cnt, %mm0
	movd	   %mm0, 16(rp,n,4)

	mov	   20(up,n,4), %eax
	sbb	   20(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   cnt, %mm1
	movd	   %mm1, 20(rp,n,4)

	mov	   24(up,n,4), %edx
	sbb	   24(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   cnt, %mm0
	movd	   %mm0, 24(rp,n,4)

	mov	   28(up,n,4), %eax
	sbb	   28(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   cnt, %mm1
	movd	   %mm1, 28(rp,n,4)

	mov	   32(up,n,4), %edx
	sbb	   32(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   cnt, %mm0
	movd	   %mm0, 32(rp,n,4)

	lea	   8(n), n			C advance 8 limbs, CF preserved
	jmp	   L(top)

EPILOGUE()
159