xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/submul_1.asm (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
2dnl  subtract the result from a second limb vector.
3
4dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C			    cycles/limb
36C P6 model 0-8,10-12		-
37C P6 model 9   (Banias)		6.8
38C P6 model 13  (Dothan)		6.9
39C P4 model 0-1 (Willamette)	?
40C P4 model 2   (Northwood)	5.87
41C P4 model 3-4 (Prescott)	6.5
42
43C This code represents a step forwards compared to the code available before
44C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
45C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and
46C Prescott compared to the old code.
47C
48C The arrangements made here to get a two-instruction dependent chain are
49C slightly subtle.  In the loop the carry (or rather the borrow) is kept as a
50C negative value, so that a single paddq yields both a low limb ready to store
51C and a high limb ready to become the new carry after a psrlq.
52C
53C If the carry was a simple twos complement negative then the psrlq shift would
54C need to bring in 0 bits or 1 bits according to whether the high was zero or
55C non-zero, since a non-zero value would represent a negative needing sign
56C extension.  That wouldn't be particularly easy to arrange and certainly would
57C add an instruction to the dependent chain, so instead an offset is applied so
58C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to
59C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
60C always positive and can always have 0 bits shifted in, which is what psrlq
61C does.
62C
63C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
64C done off the dependent chain.  The total adjustment then is to add
65C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
66C to remove the offset from the current carry, for a net add of
67C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
68C fetched.
69C
70C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
71C negative, which is how it's undone for the return value, but that doesn't
72C seem as clear.
73
C Stack parameters, named via defframe together with their byte offsets.
C The names are used further down as memory operands, for example
C "mov PARAM_SRC, %eax".  PARAM_CARRY is only read by mpn_submul_1c.
C NOTE(review): the offsets are presumably relative to %esp at function
C entry, with the return address at offset 0 -- confirm against the
C defframe macro definition, which is not part of this file.
74defframe(PARAM_CARRY,     20)
75defframe(PARAM_MULTIPLIER,16)
76defframe(PARAM_SIZE,      12)
77defframe(PARAM_SRC,       8)
78defframe(PARAM_DST,       4)
79
C NOTE(review): TEXT presumably selects the code section and ALIGN(16)
C aligns the first entry point that follows -- both macros are defined
C outside this file.
80	TEXT
81	ALIGN(16)
82
C mpn_submul_1c -- entry point that takes an initial borrow in PARAM_CARRY.
C It loads that value into mm1 and jumps to L(start_1c) inside
C mpn_submul_1, which then runs exactly as mpn_submul_1 does except that
C mm1 holds the caller-supplied borrow instead of zero.
83PROLOGUE(mpn_submul_1c)
84deflit(`FRAME',0)
85	movd	PARAM_CARRY, %mm1	C mm1 = initial borrow, upper 32 bits zero
86	jmp	L(start_1c)		C the shared body lives in mpn_submul_1
87EPILOGUE()
88
C mpn_submul_1 -- multiply the limbs at PARAM_SRC by PARAM_MULTIPLIER and
C subtract the products from the limbs at PARAM_DST, storing the 32-bit
C results back through PARAM_DST.  PARAM_SIZE is the number of limbs.
C
C Returns in %eax the borrow out of the top limb: the high half of the
C running carry register mm0, complemented to remove the 0xFFFFFFFF offset.
C
C Register use:
C   eax      src pointer (up)
C   edx      dst pointer (rp)
C   ecx      count of limbs not yet loaded
C   mm7      multiplier
C   mm6      constant 0xFFFFFFFE00000001, added to every rp limb
C   mm0      running borrow, held offset as 0xFFFFFFFF+c
C   mm3,mm1  up limbs, later their 64-bit products with mm7 (alternating)
C   mm4,mm2  rp limbs with the mm6 adjustment applied (alternating)
C
C The first limb is loaded before the count is tested, so this code
C assumes PARAM_SIZE >= 1.  NOTE(review): confirm callers never pass 0.
89PROLOGUE(mpn_submul_1)
90deflit(`FRAME',0)
91	pxor	%mm1, %mm1		C initial borrow = 0
92
93L(start_1c):
94	mov	PARAM_SRC, %eax		C eax = src pointer (up)
95	pcmpeqd	%mm0, %mm0		C mm0 = all ones
96
97	movd	PARAM_MULTIPLIER, %mm7	C mm7 = multiplier limb
98	pcmpeqd	%mm6, %mm6		C mm6 = all ones
99
100	mov	PARAM_DST, %edx		C edx = dst pointer (rp)
101	psrlq	$32, %mm0		C 0x00000000FFFFFFFF
102
103	mov	PARAM_SIZE, %ecx	C ecx = limb count
104	psllq	$32, %mm6		C 0xFFFFFFFF00000000
105
106	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001, the per-limb adjustment
107
108	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow, the offset incoming carry
109
110
111	movd	(%eax), %mm3		C up[0]
112	movd	(%edx), %mm4		C rp[0]
113
114	add	$-1, %ecx		C ZF set when size was 1
115	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
116	pmuludq	%mm7, %mm3		C mm3 = up[0] * multiplier, 64 bits
117	jnz	L(gt1)			C tests the add above, MMX ops leave EFLAGS alone
C Size 1: finish the only limb and go straight to the return sequence.
118	psubq	%mm3, %mm4		C rp[0] + adjustment - prod
119	paddq	%mm4, %mm0		C fold in the incoming borrow
120	movd	%mm0, (%edx)		C store low half as the result limb
121	jmp	L(rt)
122
123L(gt1):	movd	4(%eax), %mm1		C up[1]
124	movd	4(%edx), %mm2		C rp[1]
125
126	add	$-1, %ecx
127	jz	L(eev)			C size 2: nothing more to load, wind down
128
C Main loop, two limbs per pass.  On entry to L(top) mm3 is a product not
C yet subtracted and mm4 its adjusted rp limb, both for (%edx), while
C mm1 and mm2 hold the raw up and rp limbs for 4(%edx).  Each half finishes
C one limb while loading and starting the one after next, and the second
C half is the first with the roles of mm3,mm4 and mm1,mm2 exchanged.
129	ALIGN(16)
130L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001 to limb at 4(%edx)
131	pmuludq	%mm7, %mm1		C product for limb at 4(%edx)
132	psubq	%mm3, %mm4		C subtract prod for limb at (%edx)
133	movd	8(%eax), %mm3		C load next up limb
134	paddq	%mm4, %mm0		C fold in borrow
135	movd	8(%edx), %mm4		C load next rp limb
136	movd	%mm0, (%edx)		C store result limb
137	psrlq	$32, %mm0		C high half becomes the next offset carry
138
139	add	$-1, %ecx
140	jz	L(eod)			C all loaded: finish 4(%edx) and 8(%edx)
141
142	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001 to limb at 8(%edx)
143	pmuludq	%mm7, %mm3		C product for limb at 8(%edx)
144	psubq	%mm1, %mm2		C subtract prod for limb at 4(%edx)
145	movd	12(%eax), %mm1		C load next up limb
146	paddq	%mm2, %mm0		C fold in borrow
147	movd	12(%edx), %mm2		C load next rp limb
148	movd	%mm0, 4(%edx)		C store result limb
149	psrlq	$32, %mm0		C high half becomes the next offset carry
150
151	lea	8(%eax), %eax		C advance src by two limbs
152	lea	8(%edx), %edx		C advance dst by two limbs
153	add	$-1, %ecx
154	jnz	L(top)
155
156
C L(eev): wind-down with the same register state as at L(top), that is a
C pending product in mm3,mm4 for (%edx) and raw limbs in mm1,mm2 for
C 4(%edx), but with no further limbs to load.
157L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
158	pmuludq	%mm7, %mm1		C last product
159	psubq	%mm3, %mm4		C prod
160	paddq	%mm4, %mm0		C borrow
161	movd	%mm0, (%edx)		C result
162	psrlq	$32, %mm0		C carry into the last limb
163	psubq	%mm1, %mm2		C prod
164	paddq	%mm2, %mm0		C borrow
165	movd	%mm0, 4(%edx)		C result
C L(rt): common return.  The high half of mm0 is 0xFFFFFFFF+c with c the
C negative carry, so its ones-complement is the borrow to return.
166L(rt):	psrlq	$32, %mm0		C bring the offset carry to the low half
167	movd	%mm0, %eax
168	not	%eax			C remove the 0xFFFFFFFF offset, eax = borrow out
169	emms				C empty the MMX state before returning
170	ret
171
C L(eod): wind-down for the other phase of the loop: the pending product is
C in mm1,mm2 for 4(%edx) and the raw limbs are in mm3,mm4 for 8(%edx).
172L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
173	pmuludq	%mm7, %mm3		C last product
174	psubq	%mm1, %mm2		C prod
175	paddq	%mm2, %mm0		C borrow
176	movd	%mm0, 4(%edx)		C result
177	psrlq	$32, %mm0		C carry into the last limb
178	psubq	%mm3, %mm4		C prod
179	paddq	%mm4, %mm0		C borrow
180	movd	%mm0, 8(%edx)		C result
181	jmp	L(rt)
182EPILOGUE()
183