xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/sse2/submul_1.asm (revision c7c727fae85036860d5bb848f2730ff419e2b060)
1dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
2dnl  subtract the result from a second limb vector.
3
4dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon
25C     (stepping 10).
26
27
28C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29C                         mp_limb_t mult);
30C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
31C                          mp_limb_t mult, mp_limb_t carry);
32C
33C This code is not particularly good at 7 c/l.  The dependent chain is only
34C 4 c/l and there are only 4 MMX unit instructions, so it's not clear why that
35C speed isn't achieved.
36C
37C The arrangements made here to get a two instruction dependent chain are
38C slightly subtle.  In the loop the carry (or borrow rather) is held as a
39C negative value, so that a single paddq gives a low limb ready to store and
40C a high limb ready to become the new carry after a psrlq.
41C
42C If the carry were a simple two's complement negative then the psrlq shift
43C would need to bring in 0 bits or 1 bits according to whether the high was
44C zero or non-zero, since a non-zero value would represent a negative
45C needing sign extension.  That wouldn't be particularly easy to arrange and
46C certainly would add an instruction to the dependent chain, so instead an
47C offset is applied so that the high limb will be 0xFFFFFFFF+c.  With c in
48C the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to
49C 0xFFFFFFFF and is therefore always positive and can always have 0 bits
50C shifted in, which is what psrlq does.
51C
52C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
53C done off the dependent chain.  The total adjustment then is to add
54C 0xFFFFFFFF00000000 to offset the new carry, and subtract
55C 0x00000000FFFFFFFF to remove the offset from the current carry, for a net
56C add of 0xFFFFFFFE00000001.  In the code this is applied to the destination
57C limb when fetched.
58C
59C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
60C negative, which is how it's undone for the return value, but that doesn't
61C seem as clear.
62
C Argument slots, in the same order as the C prototypes above: dst at 4
C through carry at 20.  Presumably these are byte offsets from the stack
C pointer at function entry (NOTE(review): confirm against the defframe
C definition).  The carry slot is read only by mpn_submul_1c.
63defframe(PARAM_CARRY,     20)
64defframe(PARAM_MULTIPLIER,16)
65defframe(PARAM_SIZE,      12)
66defframe(PARAM_SRC,       8)
67defframe(PARAM_DST,       4)
68
69	TEXT
70	ALIGN(16)
71
C mpn_submul_1c: entry point that takes an initial borrow.  It loads the
C carry argument into mm1 and joins mpn_submul_1 at its start_1c label,
C which expects the initial borrow in mm1.  Nothing is pushed on the stack
C here, so the argument offsets are the same on both entry paths.
72PROLOGUE(mpn_submul_1c)
73deflit(`FRAME',0)
74	movd	PARAM_CARRY, %mm1	C mm1 = initial borrow, upper 32 bits zero
75	jmp	L(start_1c)		C share the setup and loop of mpn_submul_1
76EPILOGUE()
77
C mpn_submul_1: for each of size limbs, subtract src[i]*mult and the running
C borrow from dst[i], store the low 32 bits back to dst[i], and return the
C final borrow in eax.  The loop tests its counter after the body, so it
C always runs at least once; size is presumably required to be >= 1 by
C callers (NOTE(review): confirm against the mpn interface rules).
78PROLOGUE(mpn_submul_1)
79deflit(`FRAME',0)
80	pxor	%mm1, %mm1		C initial borrow = 0 on this entry path
81
	C Both entry points arrive here with the initial borrow in mm1.  The
	C integer loads are interleaved with the MMX constant construction.
82L(start_1c):
83	movl	PARAM_SRC, %eax		C eax = src pointer
84	pcmpeqd	%mm0, %mm0		C mm0 = all ones
85
86	movd	PARAM_MULTIPLIER, %mm7	C mm7 = multiplier, upper 32 bits zero
87	pcmpeqd	%mm6, %mm6		C mm6 = all ones
88
89	movl	PARAM_DST, %edx		C edx = dst pointer
90	psrlq	$32, %mm0		C 0x00000000FFFFFFFF
91
92	movl	PARAM_SIZE, %ecx	C ecx = limb count
93	psllq	$32, %mm6		C 0xFFFFFFFF00000000
94
95	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001, net per-limb adjustment
96
97	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow, the offset carry form
98
99
100	C eax	src, incrementing
101	C ebx
102	C ecx	loop counter, decrementing
103	C edx	dst, incrementing
104	C
105	C mm0	0xFFFFFFFF - borrow
106	C mm6	0xFFFFFFFE00000001
107	C mm7	multiplier
108
	C Per iteration mm0 becomes
	C   (0xFFFFFFFF - borrow) + dst + 0xFFFFFFFE00000001 - src*mult
	C whose low half is the result limb and whose high half is
	C 0xFFFFFFFF - new borrow, restoring the invariant after the shift.
109L(loop):
110	movd	(%eax), %mm1		C mm1 = src limb
111	leal	4(%eax), %eax		C advance src
112	movd	(%edx), %mm2		C mm2 = dst limb
113	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
114	pmuludq	%mm7, %mm1		C mm1 = 64-bit product src limb * multiplier
115	psubq	%mm1, %mm2		C mm2 = dst + adjustment - prod
116	paddq	%mm2, %mm0		C fold in the offset borrow
117	subl	$1, %ecx		C sets the zero flag tested by jnz below
118	movd	%mm0, (%edx)		C store low 32 bits as the result limb
119	psrlq	$32, %mm0		C mm0 = 0xFFFFFFFF - new borrow
120	leal	4(%edx), %edx		C advance dst; leal leaves the flags alone
121	jnz	L(loop)			C loop until the count reaches zero
122
123	movd	%mm0, %eax		C eax = 0xFFFFFFFF - borrow
124	notl	%eax			C ones complement undoes the offset: eax = borrow
125	emms				C leave MMX state before returning
126	ret
127
128EPILOGUE()
129