dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9
C AMD K10	 0.85		 1.64				Y/N
C AMD bull	 1.4		 1.4				Y
C AMD pile	 0.68		 0.98				Y/N
C AMD steam
C AMD excavator
C AMD bobcat
C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
C Intel P4	 2.3		 2.3				Y
C Intel core	 1.0		 1.0		0.52/0.80	N
C Intel NHM	 0.5		 0.67				Y
C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
C Intel IBR	 0.50		 0.57		opt/0.50	Y
C Intel HWL	 0.50		 0.57		opt/0.51	Y
C Intel BWL	 0.55		 0.62		opt/0.55	Y
C Intel atom
C Intel SLM	 1.02		 1.27		opt/1.04	Y/N
C VIA nano	 1.16		 5.16				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there, performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how
C best to handle the unaligned case there.
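
C For reference, the operation is roughly the following C (an illustrative
C sketch only; the code below works in 16-byte chunks and copies the limbs
C from the top down, the direction that is safe when rp points above an
C overlapping up):
C
C	void mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  while (n-- > 0)
C	    rp[n] = up[n];
C	}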

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)
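C FUNC_ENTRY(3) is a no-op under the SysV ABI; under DOS64 it moves the three
C Windows x64 argument registers into the registers named by the defines above.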

	test	n, n
	jz	L(don)

	lea	-16(rp,n,8), rp
	lea	-16(up,n,8), up

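C rp and up now point 16 bytes below the ends of their operands, i.e. at the
C highest full 16-byte chunk.  If rp is not 16-byte aligned, copy the top limb
C with a plain 64-bit move first, so that every movdqa store below is aligned.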
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	8(up), %rax
	lea	-8(up), up
	mov	%rax, 8(rp)
	lea	-8(rp), rp
	dec	n

L(ali):	sub	$16, n
	jc	L(sma)

IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
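
C The IFDOS lines above spill xmm6 and xmm7, which are callee-saved under the
C Windows x64 ABI; they are restored after the main loop.
C
C Main loop: copy 16 limbs (128 bytes) per iteration, high to low, with
C unaligned reads and aligned writes.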

	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	movdqu	-64(up), %xmm4
	movdqu	-80(up), %xmm5
	movdqu	-96(up), %xmm6
	movdqu	-112(up), %xmm7
	lea	-128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	movdqa	%xmm4, -64(rp)
	movdqa	%xmm5, -80(rp)
	movdqa	%xmm6, -96(rp)
	movdqa	%xmm7, -112(rp)
	lea	-128(rp), rp
	sub	$16, n
	jnc	L(top)

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	add	$56, %rsp	')

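C Handle the remaining 0-15 limbs.  Here n is 16 below the true remaining
C count, but its low four bits are unchanged, so they still select the tail
C chunks of 8, 4, 2, and 1 limbs.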
L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, (rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	8(up), %r8
	mov	%r8, 8(rp)
1:
L(don):	FUNC_EXIT()
	ret
EPILOGUE()
