dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	    cycles/limb		  good for cpu?
C AMD K8,K9
C AMD K10	 0.85			Y
C AMD bd1	 0.8			Y
C AMD bobcat
C Intel P4	 2.28			Y
C Intel core2	 1
C Intel NHM	 0.5			Y
C Intel SBR	 0.5			Y
C Intel atom
C VIA nano	 1.1			Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write
C using aligned 16-byte operations; reads are done with 16-byte operations
C at both aligned and unaligned addresses.
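
C The following is a minimal C sketch of this strategy using SSE2
C intrinsics, assuming 8-byte limbs; the function name copyd_sketch is
C hypothetical and the code is illustrative, not part of GMP:
C
C	#include <emmintrin.h>
C	#include <stdint.h>
C	#include <stddef.h>
C
C	void copyd_sketch (uint64_t *rp, const uint64_t *up, size_t n)
C	{
C	  if (n > 0 && (((uintptr_t) (rp + n)) & 15) != 0)
C	    {			/* one 8-byte copy to align the stores */
C	      n--;
C	      rp[n] = up[n];
C	    }
C	  while (n >= 2)
C	    {			/* unaligned 16-byte read, aligned write */
C	      n -= 2;
C	      __m128i x = _mm_loadu_si128 ((const __m128i *) (up + n));
C	      _mm_store_si128 ((__m128i *) (rp + n), x);
C	    }
C	  if (n)		/* bottom-most limb needs an 8-byte copy */
C	    rp[0] = up[0];
C	}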

C Instead of having separate loops for aligned and unaligned reads, we
C always read using MOVDQU.  This works well except on core2, where
C performance doubles when reading with MOVDQA (for an aligned source).
C It is unclear how best to handle the unaligned case there.
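
C In intrinsic terms the trade-off is _mm_loadu_si128 (MOVDQU, any
C alignment) versus _mm_load_si128 (MOVDQA, which requires a 16-byte
C aligned address).  A hypothetical core2-oriented variant would test the
C source alignment once and run a separate MOVDQA read loop; the per-load
C difference is just:
C
C	if (((uintptr_t) up & 15) == 0)
C	  x = _mm_load_si128 ((const __m128i *) up);	/* MOVDQA */
C	else
C	  x = _mm_loadu_si128 ((const __m128i *) up);	/* MOVDQU */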

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')
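
C The function implements the C prototype
C   void mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C and copies n limbs from up to rp, starting at the most significant limb.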

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	test	n, n
	jz	L(don)

	lea	-16(rp,n,8), rp
	lea	-16(up,n,8), up

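C rp and up now point 16 bytes below the top end of each operand, i.e. at
C the highest 16-byte block; the top limb itself is at 8(rp) and 8(up).
C If rp is not 16-byte aligned, copy that single limb first so that every
C movdqa store below lands on a 16-byte boundary.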
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	8(up), %rax
	lea	-8(up), up
	mov	%rax, 8(rp)
	lea	-8(rp), rp
	dec	n

	sub	$16, n
	jc	L(sma)			C fewer than 16 limbs: skip loop

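C Main loop: copy 16 limbs (128 bytes) per iteration, reading with
C unaligned loads and writing with aligned stores.  L(ali) is the entry
C point for an already aligned rp.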
	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	movdqu	-64(up), %xmm4
	movdqu	-80(up), %xmm5
	movdqu	-96(up), %xmm6
	movdqu	-112(up), %xmm7
	lea	-128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	movdqa	%xmm4, -64(rp)
	movdqa	%xmm5, -80(rp)
	movdqa	%xmm6, -96(rp)
	movdqa	%xmm7, -112(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

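C Wind down: n is negative here, but its low four bits still hold the
C residual limb count (0 to 15), so peel off blocks of 8, 4, 2, and 1
C limbs as indicated by those bits.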
L(sma):	test	$8, R8(n)		C copy 8 leftover limbs?
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)		C copy 4 leftover limbs?
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)		C copy 2 leftover limbs?
	jz	1f
	movdqu	(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, (rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)		C copy the last leftover limb?
	jz	1f
	mov	8(up), %r8
	mov	%r8, 8(rp)
1:
L(don):	FUNC_EXIT()
	ret
EPILOGUE()
