xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/submul_1.asm (revision 19ef5b5b0bcb90f63509df6e78769de1b57c2758)
1dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
2dnl  subtract the result from a second limb vector.
3
4dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C			    cycles/limb
25C P6 model 0-8,10-12		-
26C P6 model 9   (Banias)		6.8
27C P6 model 13  (Dothan)		6.9
28C P4 model 0-1 (Willamette)	?
29C P4 model 2   (Northwood)	5.87
30C P4 model 3-4 (Prescott)	6.5
31
32C This code represents a step forwards compared to the code available before
33C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
34C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and
35C Prescott compared to the old code.
36C
37C The arrangements made here to get a two instruction dependent chain are
38C slightly subtle.  In the loop the carry (or borrow rather) is a negative so
39C that a paddq can be used to give a low limb ready to store, and a high limb
40C ready to become the new carry after a psrlq.
41C
42C If the carry were a simple two's complement negative then the psrlq shift would
43C need to bring in 0 bits or 1 bits according to whether the high was zero or
44C non-zero, since a non-zero value would represent a negative needing sign
45C extension.  That wouldn't be particularly easy to arrange and certainly would
46C add an instruction to the dependent chain, so instead an offset is applied so
47C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to
48C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
49C always positive and can always have 0 bits shifted in, which is what psrlq
50C does.
51C
52C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
53C done off the dependent chain.  The total adjustment then is to add
54C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
55C to remove the offset from the current carry, for a net add of
56C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
57C fetched.
58C
59C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
60C negative, which is how it's undone for the return value, but that doesn't
61C seem as clear.
62
C Stack offsets of the arguments, named for use in the code below.
C NOTE(review): presumably byte offsets from the stack pointer at entry with
C the return address at offset 0 -- confirm against the defframe definition,
C which is not in this file.
63defframe(PARAM_CARRY,     20)	C initial borrow, read only by mpn_submul_1c
64defframe(PARAM_MULTIPLIER,16)	C single limb multiplied into every source limb
65defframe(PARAM_SIZE,      12)	C number of limbs to process, must be at least 1
66defframe(PARAM_SRC,       8)	C pointer to the limbs that get multiplied
67defframe(PARAM_DST,       4)	C pointer to the limbs subtracted from, updated in place
68
69	TEXT
70	ALIGN(16)
71
C mpn_submul_1c -- entry point that takes an explicit initial borrow.
C It only loads PARAM_CARRY into mm1 and then jumps to L(start_1c) inside
C mpn_submul_1, which expects the incoming borrow in mm1 and does all of
C the remaining work, including the return.
72PROLOGUE(mpn_submul_1c)
73deflit(`FRAME',0)
74	movd	PARAM_CARRY, %mm1	C mm1 = caller-supplied borrow, upper 32 bits zeroed
75	jmp	L(start_1c)		C join the shared body in mpn_submul_1
76EPILOGUE()
77
C mpn_submul_1 -- multiply the PARAM_SIZE limbs at PARAM_SRC by
C PARAM_MULTIPLIER, subtract the products from the limbs at PARAM_DST in
C place, and return the borrow out of the top limb in eax.
C
C PARAM_SIZE must be at least 1: the first limb is loaded and processed
C before the count is tested.
C
C Register roles:
C   eax      source pointer, reused for the return value at L(rt)
C   edx      destination pointer
C   ecx      limbs still to be started
C   mm7      multiplier
C   mm6      constant 0xFFFFFFFE00000001 added to every destination limb
C   mm0      running borrow, held between limbs as 0xFFFFFFFF minus borrow
C   mm3,mm4  source and destination limb for even positions
C   mm1,mm2  source and destination limb for odd positions
78PROLOGUE(mpn_submul_1)
79deflit(`FRAME',0)
80	pxor	%mm1, %mm1		C initial borrow is zero for this entry point
81
82L(start_1c):				C mpn_submul_1c joins here with its borrow in mm1
83	mov	PARAM_SRC, %eax		C eax = source pointer
84	pcmpeqd	%mm0, %mm0		C mm0 = all ones
85
86	movd	PARAM_MULTIPLIER, %mm7	C mm7 = multiplier
87	pcmpeqd	%mm6, %mm6		C mm6 = all ones
88
89	mov	PARAM_DST, %edx		C edx = destination pointer
90	psrlq	$32, %mm0		C mm0 = 0x00000000FFFFFFFF
91
92	mov	PARAM_SIZE, %ecx	C ecx = limb count
93	psllq	$32, %mm6		C mm6 = 0xFFFFFFFF00000000
94
95	psubq	%mm0, %mm6		C mm6 = 0xFFFFFFFE00000001, the per-limb adjustment
96
97	psubq	%mm1, %mm0		C mm0 = 0xFFFFFFFF - borrow, the offset form
98
99
100	movd	(%eax), %mm3		C mm3 = first source limb
101	movd	(%edx), %mm4		C mm4 = first destination limb
102
103	add	$-1, %ecx		C count the first limb; ZF set if it was the only one
104	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
105	pmuludq	%mm7, %mm3		C mm3 = 64-bit product of multiplier and source limb
106	jnz	L(gt1)			C tests the add above; the MMX ops do not touch flags
C Single limb: complete it here and go straight to the return sequence.
107	psubq	%mm3, %mm4		C adjusted destination limb minus product
108	paddq	%mm4, %mm0		C low half = result limb, high half = next offset borrow
109	movd	%mm0, (%edx)		C store the low 32 bits as the result limb
110	jmp	L(rt)
111
112L(gt1):	movd	4(%eax), %mm1		C mm1 = second source limb, multiplied later
113	movd	4(%edx), %mm2		C mm2 = second destination limb, adjusted later
114
115	add	$-1, %ecx		C count the second limb
116	jz	L(eev)			C exactly two limbs: skip the loop
117
C Main loop, two limbs per pass.  State on every entry to L(top):
C   mm3 = product for the limb at 0(eax), mm4 = adjusted limb from 0(edx)
C   mm1 = unmultiplied limb from 4(eax),  mm2 = unadjusted limb from 4(edx)
C   mm0 = offset borrow, ecx = non-zero count of limbs not yet loaded
C Each half loads one limb, adjusts and multiplies the previously loaded
C one, and completes the one before that.
118	ALIGN(16)
119L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001 to the limb from 4(edx)
120	pmuludq	%mm7, %mm1		C mm1 = product for the limb at 4(eax)
121	psubq	%mm3, %mm4		C subtract product from the limb at 0
122	movd	8(%eax), %mm3		C load source limb two positions ahead
123	paddq	%mm4, %mm0		C fold in the borrow: low = result, high = next borrow
124	movd	8(%edx), %mm4		C load destination limb two positions ahead
125	movd	%mm0, (%edx)		C store result limb 0
126	psrlq	$32, %mm0		C keep only the new offset borrow
127
128	add	$-1, %ecx		C count the limb just loaded from offset 8
129	jz	L(eod)			C that was the last limb: finish in the odd tail
130
131	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001 to the limb from 8(edx)
132	pmuludq	%mm7, %mm3		C mm3 = product for the limb at 8(eax)
133	psubq	%mm1, %mm2		C subtract product from the limb at 4
134	movd	12(%eax), %mm1		C load the next odd-position source limb
135	paddq	%mm2, %mm0		C fold in the borrow: low = result, high = next borrow
136	movd	12(%edx), %mm2		C load the next odd-position destination limb
137	movd	%mm0, 4(%edx)		C store result limb 1
138	psrlq	$32, %mm0		C keep only the new offset borrow
139
140	lea	8(%eax), %eax		C advance source pointer by two limbs
141	lea	8(%edx), %edx		C advance destination pointer by two limbs
142	add	$-1, %ecx		C count the limb just loaded from old offset 12
143	jnz	L(top)			C more limbs to load: entry state holds again
144
145
C Even tail: two limbs are pending at offsets 0 and 4 and nothing is left
C to load.  Complete both, then fall through into the return sequence.
146L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001 to the limb from 4(edx)
147	pmuludq	%mm7, %mm1		C mm1 = product for the limb at 4(eax)
148	psubq	%mm3, %mm4		C subtract product from the limb at 0
149	paddq	%mm4, %mm0		C fold in the borrow
150	movd	%mm0, (%edx)		C store result limb 0
151	psrlq	$32, %mm0		C keep only the new offset borrow
152	psubq	%mm1, %mm2		C subtract product from the limb at 4
153	paddq	%mm2, %mm0		C fold in the borrow
154	movd	%mm0, 4(%edx)		C store result limb 1
155L(rt):	psrlq	$32, %mm0		C mm0 = 0xFFFFFFFF minus the final borrow
156	movd	%mm0, %eax		C move it to the integer return register
157	not	%eax			C ones complement removes the offset: eax = borrow
158	emms				C clear MMX state before returning
159	ret
160
C Odd tail, reached from the middle of the loop with ecx zero: the limb at
C offset 4 is pending in mm1 and mm2, and the final limb at offset 8 has
C just been loaded into mm3 and mm4.  Result limb 0 is already stored.
161L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001 to the limb from 8(edx)
162	pmuludq	%mm7, %mm3		C mm3 = product for the limb at 8(eax)
163	psubq	%mm1, %mm2		C subtract product from the limb at 4
164	paddq	%mm2, %mm0		C fold in the borrow
165	movd	%mm0, 4(%edx)		C store result limb 1
166	psrlq	$32, %mm0		C keep only the new offset borrow
167	psubq	%mm3, %mm4		C subtract product from the limb at 8
168	paddq	%mm4, %mm0		C fold in the borrow
169	movd	%mm0, 8(%edx)		C store result limb 2
170	jmp	L(rt)			C shared return sequence
171EPILOGUE()
172