xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86_64/fastsse/com.asm (revision 9573673d78c64ea1eac42d7f2e9521be89932ae5)
1dnl  AMD64 mpn_com optimised for CPUs with fast SSE.
2
3dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  Contributed to the GNU project by Torbjorn Granlund.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C	     cycles/limb     cycles/limb     cycles/limb      good
25C              aligned	      unaligned	      best seen	     for cpu?
26C AMD K8,K9	 2.0		 2.0				N
27C AMD K10	 0.85		 1.3				Y/N
28C AMD bd1	 1.40		 1.40				Y
29C AMD bobcat	 3.1		 3.1				N
30C Intel P4	 2.28		 illop				Y
31C Intel core2	 1.02		 1.02				N
32C Intel NHM	 0.53		 0.68				Y
33C Intel SBR	 0.51		 0.75				Y
34C Intel atom	 3.68		 3.68				N
35C VIA nano	 1.17		 5.09				Y/N
36
37C We try to do as many 16-byte operations as possible.  The top-most and
38C bottom-most writes might need 8-byte operations.  We can always write using
39C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
40C operations.
41
42C Instead of having separate loops for reading aligned and unaligned, we read
43C using MOVDQU.  This seems to work great except for core2; there performance
44C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
45C best handle the unaligned case there.
46
47C INPUT PARAMETERS
48define(`rp', `%rdi')
49define(`up', `%rsi')
50define(`n',  `%rdx')
51
52ABI_SUPPORT(DOS64)
53ABI_SUPPORT(STD64)
54
55ASM_START()
56	TEXT
57	ALIGN(16)
58PROLOGUE(mpn_com)
C  void mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
C  Store the one's complement of the n-limb number {up,n} at {rp,n}.
C  Strategy: peel limbs until rp is 16-byte aligned, then complement
C  14 limbs (7 x 16 bytes) per iteration with unaligned reads (MOVDQU)
C  and aligned writes (MOVDQA), finishing the <14 remaining limbs by
C  binary decomposition (8, 4, 2, 1 limbs).
59	FUNC_ENTRY(3)
60
61	test	n, n			C n == 0?
62	jz	L(don)			C nothing to do for an empty operand
63
64	pcmpeqb	%xmm7, %xmm7		C set to 111...111
65
66	test	$8, R8(rp)		C is rp 16-byte aligned?
67	jz	L(ali)			C jump if rp aligned
68	mov	(up), %rax		C peel one limb so that rp becomes
69	lea	8(up), up		C   16-byte aligned for the MOVDQA
70	not	%rax			C   stores below
71	mov	%rax, (rp)
72	lea	8(rp), rp
73	dec	n
74
75	sub	$14, n			C at least one 14-limb block left?
76	jc	L(sma)			C no, go handle the tail only
77
78	ALIGN(16)
C  Main loop: complement 14 limbs (112 bytes) per iteration.
C  Reads may be unaligned (MOVDQU); writes are 16-byte aligned (MOVDQA).
79L(top):	movdqu	(up), %xmm0
80	movdqu	16(up), %xmm1
81	movdqu	32(up), %xmm2
82	movdqu	48(up), %xmm3
83	movdqu	64(up), %xmm4
84	movdqu	80(up), %xmm5
85	movdqu	96(up), %xmm6
86	lea	112(up), up
87	pxor	%xmm7, %xmm0		C xor with all-ones = complement
88	pxor	%xmm7, %xmm1
89	pxor	%xmm7, %xmm2
90	pxor	%xmm7, %xmm3
91	pxor	%xmm7, %xmm4
92	pxor	%xmm7, %xmm5
93	pxor	%xmm7, %xmm6
94	movdqa	%xmm0, (rp)
95	movdqa	%xmm1, 16(rp)
96	movdqa	%xmm2, 32(rp)
97	movdqa	%xmm3, 48(rp)
98	movdqa	%xmm4, 64(rp)
99	movdqa	%xmm5, 80(rp)
100	movdqa	%xmm6, 96(rp)
101	lea	112(rp), rp
C  Loop entry point: the aligned-rp path jumps here to do the first
C  count decrement, so that n < 14 falls straight through to L(sma).
102L(ali):	sub	$14, n
103	jnc	L(top)
104
C  Tail: n was left 14 too small; restore it (now 0 <= n <= 13) and
C  handle the remaining limbs by testing bits 8, 4, 2 and 1 of n.
105L(sma):	add	$14, n
106	test	$8, R8(n)
107	jz	1f
108	movdqu	(up), %xmm0		C 8 limbs = 64 bytes = 4 xmm regs
109	movdqu	16(up), %xmm1
110	movdqu	32(up), %xmm2
111	movdqu	48(up), %xmm3
112	lea	64(up), up
113	pxor	%xmm7, %xmm0
114	pxor	%xmm7, %xmm1
115	pxor	%xmm7, %xmm2
116	pxor	%xmm7, %xmm3
117	movdqa	%xmm0, (rp)
118	movdqa	%xmm1, 16(rp)
119	movdqa	%xmm2, 32(rp)
120	movdqa	%xmm3, 48(rp)
121	lea	64(rp), rp
1221:
123	test	$4, R8(n)
124	jz	1f
125	movdqu	(up), %xmm0		C 4 limbs = 32 bytes = 2 xmm regs
126	movdqu	16(up), %xmm1
127	lea	32(up), up
128	pxor	%xmm7, %xmm0
129	pxor	%xmm7, %xmm1
130	movdqa	%xmm0, (rp)
131	movdqa	%xmm1, 16(rp)
132	lea	32(rp), rp
1331:
134	test	$2, R8(n)
135	jz	1f
136	movdqu	(up), %xmm0		C 2 limbs = 16 bytes = 1 xmm reg
137	lea	16(up), up
138	pxor	%xmm7, %xmm0
139	movdqa	%xmm0, (rp)
140	lea	16(rp), rp
1411:
142	test	$1, R8(n)
143	jz	1f
144	mov	(up), %rax		C final odd limb, done with a
145	not	%rax			C   plain 8-byte load/store
146	mov	%rax, (rp)
1471:
148L(don):	FUNC_EXIT()
149	ret
150EPILOGUE()
151