dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	    cycles/limb		  good for cpu?
C AMD K8,K9
C AMD K10	 0.85	 1.64		Y/N
C AMD bd1	 1.4	 1.4		Y
C AMD bobcat
C Intel P4	 2.3	 2.3		Y
C Intel core2	 1.0	 1.0
C Intel NHM	 0.5	 0.67		Y
C Intel SBR	 0.5	 0.75		Y
C Intel atom
C VIA nano	 1.16	 5.16		Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, but we read with both aligned and unaligned
C 16-byte operations.

C Instead of having separate loops for aligned and unaligned reads, we always
C read using MOVDQU.  This seems to work well except on core2, where
C performance doubles when reading with MOVDQA (for an aligned source).  It is
C unclear how best to handle the unaligned case there.
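
C For reference, the operation implemented here is the plain increasing-order
C limb copy sketched below in C (illustrative only, not part of the build;
C types follow GMP's mpn conventions):
C
C	void
C	mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}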

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$3, n
	jc	L(bc)			C n < 3: use basecase copy

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	movsq				C copy one limb, aligning rp
	dec	n

	sub	$16, n
	jc	L(sma)			C fewer than 16 limbs remain
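
C Main loop: copy 16 limbs (128 bytes) per iteration, reading with movdqu
C since up may be unaligned and writing with movdqa since rp is known to be
C 16-byte aligned here.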

	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	movdqu	112(up), %xmm7
	lea	128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	movdqa	%xmm7, 112(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)
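
C Tail code: n is negative here and its low four bits hold the number of
C remaining limbs.  Copy 8, 4, and 2 limbs as those bits indicate, then a
C final limb at L(end) if the remaining count is odd.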

L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
	ALIGN(16)
1:
L(end):	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness as the above code is currently written.

L(bc):	sub	$2, n
	jc	L(end)
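
C Copy two limbs per iteration, then one final limb below if n is odd.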
	ALIGN(16)
1:	mov	(up), %rax
	mov	8(up), %rcx
	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	lea	16(rp), rp
	sub	$2, n
	jnc	1b

	bt	$0, n
	jnc	L(ret)
	mov	(up), %rax
	mov	%rax, (rp)
L(ret):	FUNC_EXIT()
	ret
EPILOGUE()