xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/rsh1add_n.asm (revision f14316bcbc544b96a93e884bc5c2b15fd60e22ae)
1dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
2
3dnl  Copyright 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C        cycles/limb (approx)
24C      dst!=src1,2  dst==src1  dst==src2
25C P4:      4.5         6.5        6.5
26
27
28C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
29C                          mp_size_t size);
30C
31C The slightly strange combination of indexing and pointer incrementing
32C that's used seems to work best.  Not sure why, but for instance leal
33C incrementing on %esi is a 1 or 2 cycle slowdown.
34C
35C The dependent chain is paddq combining the carry and next (shifted) part,
36C plus psrlq to move the new carry down.  That, and just 4 mmx instructions
37C in total, makes 4 c/l the target speed, which is almost achieved for
38C separate src/dst but when src==dst the write combining anomalies slow it
39C down.
40
dnl  Argument slots for mpn_rsh1add_n, one per parameter of the prototype
dnl  (wp, xp, yp, size).
dnl  NOTE(review): the numbers are presumably byte offsets from %esp at
dnl  entry, as interpreted by defframe in the included m4 definitions --
dnl  confirm there.
41defframe(PARAM_SIZE, 16)
42defframe(PARAM_YP,   12)
43defframe(PARAM_XP,   8)
44defframe(PARAM_WP,   4)
45
46dnl  Re-use parameter space: the xp and yp argument slots double as the
dnl  save areas for %ebx and %esi.  The function loads each pointer out of
dnl  its slot before storing the saved register into it.
47define(SAVE_EBX,`PARAM_XP')
48define(SAVE_ESI,`PARAM_YP')
49
50	TEXT
51	ALIGN(8)
52
53PROLOGUE(mpn_rsh1add_n)
54deflit(`FRAME',0)
C Computes wp[] = (xp[] + yp[]) >> 1 over size limbs of 32 bits and returns,
C in %eax, the bit shifted out at the bottom, i.e. bit 0 of xp[0]+yp[0].
C The carry out of the top limb's addition ends up as the high bit of
C wp[size-1].
C
C size must be at least 1: xp[0] and yp[0] are loaded before size is
C examined, and L(done) always stores one limb.
C
C %ebx and %esi are saved in the xp/yp argument slots and restored before
C returning; each slot is read before it is overwritten.
55
56	movl	PARAM_XP, %edx		C edx = xp, read before its slot is reused
57	movl	%ebx, SAVE_EBX		C save ebx in the xp slot
58
59	movl	PARAM_YP, %ebx		C ebx = yp, read before its slot is reused
60	movl	%esi, SAVE_ESI		C save esi in the yp slot
61
62	movl	PARAM_WP, %esi		C esi = wp
63
64	movd	(%edx), %mm0		C xp[0]
65
66	movd	(%ebx), %mm1		C yp[0]
67	movl	PARAM_SIZE, %ecx	C ecx = size
68
69	movl	(%edx), %eax		C xp[0]
70
71	addl	(%ebx), %eax		C xp[0]+yp[0], only bit 0 is used below
72
73	paddq	%mm1, %mm0		C xp[0]+yp[0], 64-bit add keeps the carry at bit 32
74	leal	(%esi,%ecx,4), %esi	C wp end
75	negl	%ecx			C -size
76
77	psrlq	$1, %mm0		C (xp[0]+yp[0])/2
78	and	$1, %eax		C return value, rsh1 bit of xp[0]+yp[0]
79	addl	$1, %ecx		C -(size-1)
80	jz	L(done)			C size==1, nothing more to add
81
82
83L(top):
84	C eax	return value
85	C ebx	&yp[i], advanced by 4 bytes each iteration
86	C ecx	counter, limbs, -(size-1) to -1 inclusive
87	C edx	&xp[i], advanced by 4 bytes each iteration
88	C esi	wp end
89	C mm0	(xp[i]+yp[i]+carry in)/2, fits in 32 bits; bits 0-30 are the
	C	low 31 bits of dst[i], bit 31 is the carry out of limb i
90
91	movd	4(%edx), %mm1	C xp[i+1]
92	movd	4(%ebx), %mm2	C yp[i+1]
93	leal	4(%edx), %edx		C edx = &xp[i+1]
94	leal	4(%ebx), %ebx		C ebx = &yp[i+1]
95	paddq	%mm2, %mm1		C xp[i+1]+yp[i+1]
96	psllq	$31, %mm1		C low bit at 31, further 32 above
97
	C Bit 31 of mm0 is the carry out of limb i and bit 31 of mm1 is bit 0
	C of the new sum, so this one add both propagates the carry into limb
	C i+1 and produces the top bit of dst[i].
98	paddq	%mm1, %mm0		C 31 and carry from prev add
99	movd	%mm0, -4(%esi,%ecx,4)	C low ready to store dst[i]
100
101	psrlq	$32, %mm0		C high becomes new carry
102
103	addl	$1, %ecx		C one limb done
104	jnz	L(top)			C loop until the counter reaches zero
105
106
107L(done):
	C mm0 holds the top limb of the result, not yet stored.
108	movd	%mm0, -4(%esi)		C dst[size-1]
109	movl	SAVE_EBX, %ebx		C restore ebx
110
111	movl	SAVE_ESI, %esi		C restore esi
112	emms				C leave MMX state before returning
113	ret
114
115EPILOGUE()
116