xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/addlsh1_n.asm (revision a45db23f655e22f0c2354600d3b3c2cb98abf2dc)
dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.

dnl  Copyright 2001-2004, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Shared m4 configuration.  The macros used below (defframe, PROLOGUE,
dnl  EPILOGUE, L, TEXT, ALIGN, deflit) are not defined in this file, so they
dnl  are expected to come from this include chain.
include(`../config.m4')


C					cycles/limb
C			     dst!=src1,2  dst==src1  dst==src2
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		?
C P6 model 13  (Dothan)		?
C P4 model 0-1 (Willamette)	?
C P4 model 2   (Northwood)	4.25	     6		6
C P4 model 3-4 (Prescott)	5	     8.5	8.5

C The slightly strange combination of indexing and pointer incrementing
C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or
C src2 is a slowdown.
C
C The dependent chain is simply the paddq of x+2*y to the previous carry,
C then psrlq to get the new carry.  That makes 4 c/l the target speed, which
C is almost achieved for separate src/dst but when src==dst the write
C combining anomalies slow it down.

dnl  Stack arguments of mpn_addlsh1_n, in the order dst, src1, src2, size.
dnl  The offsets are presumably relative to the stack pointer at entry, with
dnl  the return address at offset 0 (see defframe in the included m4 files).
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

dnl  re-use parameter space: the src1 slot holds the caller's ebx once src1
dnl  has been loaded into a register, so no extra stack space is needed.
define(SAVE_EBX,`PARAM_SRC1')

60	TEXT
61	ALIGN(8)
62
63PROLOGUE(mpn_addlsh1_n)
64deflit(`FRAME',0)
65
66	mov	PARAM_SRC1, %eax
67	mov	%ebx, SAVE_EBX
68
69	mov	PARAM_SRC2, %ebx
70	pxor	%mm0, %mm0		C initial carry
71
72	mov	PARAM_DST, %edx
73
74	mov	PARAM_SIZE, %ecx
75
76	lea	(%edx,%ecx,4), %edx	C dst end
77	neg	%ecx			C -size
78
79L(top):
80	C eax	src1 end
81	C ebx	src2 end
82	C ecx	counter, limbs, negative
83	C edx	dst end
84	C mm0	carry
85
86	movd	(%ebx), %mm2
87	movd	(%eax), %mm1
88	psrlq	$32, %mm0
89	lea	4(%eax), %eax
90	lea	4(%ebx), %ebx
91
92	psllq	$1, %mm2
93	paddq	%mm2, %mm1
94
95	paddq	%mm1, %mm0
96
97	movd	%mm0, (%edx,%ecx,4)
98	add	$1, %ecx
99	jnz	L(top)
100
101
102	psrlq	$32, %mm0
103	mov	SAVE_EBX, %ebx
104	movd	%mm0, %eax
105	emms
106	ret
107
108EPILOGUE()
109