dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bd1	 1.39		 1.40				Y
C AMD bobcat	 1.97		 8.35		1.5/1.5		N
C Intel P4	 2.26		 illop				Y/N
C Intel core2	 0.52		0.68-0.80	opt/0.68	Y
C Intel NHM	 0.52		 0.64		opt/opt		Y
C Intel SBR	 0.51		 0.54		opt/0.51	Y
C Intel atom	 1.16		 1.66		opt/opt		Y
C VIA nano	 1.09		 1.07		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
C
C For operands of at most COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit
C loop, taken from the x86_64 default code.
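C
C Illustration of the palignr step used below, limbs written {low, high}
C within each xmm register (the high half of the top-most load is unused):
C   xmm0 = {up[0],  ---   }	aligned 16-byte load at (up)
C   xmm1 = {up[-2], up[-1]}	aligned 16-byte load at -16(up)
C   palignr($8, %xmm1, %xmm0)  =>  xmm0 = {up[-1], up[0]}
C so a single aligned 16-byte store to -8(rp) then writes both limbs.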

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest encoding.
define(`movdqa', ``movaps'')

ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
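C Operands of at most COPYD_SSE_THRESHOLD limbs take the basecase loop; the
C default of 7 applies unless the symbol was already defined.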

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	lea	-8(up,n,8), up
	lea	-8(rp,n,8), rp

	cmp	$COPYD_SSE_THRESHOLD, n
	jbe	L(bc)

	bt	$3, R32(rp)		C is rp 16-byte aligned?
	jc	L(rp_aligned)		C jump if rp aligned

	mov	(up), %rax		C copy one limb
	mov	%rax, (rp)
	lea	-8(up), up
	lea	-8(rp), rp
	dec	n

L(rp_aligned):
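C Here rp = 8 (mod 16), so the 16-byte stores at -8(rp) below are aligned.
C If up has the same residue, fall into the mutually aligned loop; otherwise
C take the palignr path at L(uent).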
	bt	$3, R32(up)
	jnc	L(uent)

ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

	ALIGN(16)
L(atop):movdqa	-8(up), %xmm0
	movdqa	-24(up), %xmm1
	movdqa	-40(up), %xmm2
	movdqa	-56(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, -8(rp)
	movdqa	%xmm1, -24(rp)
	movdqa	%xmm2, -40(rp)
	movdqa	%xmm3, -56(rp)
	lea	-64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

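C Copy the remaining n mod 8 limbs: first 4, then 2, then 1, as selected by
C the low bits of n.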
	bt	$2, R32(n)
	jnc	1f
	movdqa	-8(up), %xmm0
	movdqa	-24(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, -8(rp)
	movdqa	%xmm1, -24(rp)
	lea	-32(rp), rp

1:	bt	$1, R32(n)
	jnc	1f
	movdqa	-8(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, -8(rp)
	lea	-16(rp), rp

1:	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

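C Source and destination are mutually misaligned here (rp - up = 8 (mod 16)):
C combine pairs of aligned loads with palignr into aligned stores, moving 16
C limbs per iteration of the main loop.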
L(uent):sub	$16, n
	movdqa	(up), %xmm0
	jc	L(uend)

	ALIGN(16)
L(utop):sub	$16, n
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm2
	palignr($8, %xmm2, %xmm1)
	movdqa	%xmm1, -24(rp)
	movdqa	-48(up), %xmm3
	palignr($8, %xmm3, %xmm2)
	movdqa	%xmm2, -40(rp)
	movdqa	-64(up), %xmm0
	palignr($8, %xmm0, %xmm3)
	movdqa	%xmm3, -56(rp)
	movdqa	-80(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -72(rp)
	movdqa	-96(up), %xmm2
	palignr($8, %xmm2, %xmm1)
	movdqa	%xmm1, -88(rp)
	movdqa	-112(up), %xmm3
	palignr($8, %xmm3, %xmm2)
	movdqa	%xmm2, -104(rp)
	movdqa	-128(up), %xmm0
	palignr($8, %xmm0, %xmm3)
	movdqa	%xmm3, -120(rp)
	lea	-128(up), up
	lea	-128(rp), rp
	jnc	L(utop)

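C Wind down: copy the remaining n mod 16 limbs in groups of 8, 4, 2, and 1.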
L(uend):bt	$3, R32(n)
	jnc	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -24(rp)
	movdqa	-48(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -40(rp)
	movdqa	-64(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -56(rp)
	lea	-64(up), up
	lea	-64(rp), rp

1:	bt	$2, R32(n)
	jnc	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -24(rp)
	lea	-32(up), up
	lea	-32(rp), rp

1:	bt	$1, R32(n)
	jnc	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	lea	-16(up), up
	lea	-16(rp), rp

1:	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness, as the above code is currently written.
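C It copies 4 limbs per pass (looping only when COPYD_SSE_THRESHOLD >= 8),
C then finishes the last 0-3 limbs according to the low two bits of n.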

L(bc):	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	-8(up), %r9
	lea	-32(rp), rp
	mov	-16(up), %r10
	mov	-24(up), %r11
	lea	-32(up), up
	mov	%r8, 32(rp)
	mov	%r9, 24(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	bt	$0, R32(n)
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)
	lea	-8(rp), rp
	lea	-8(up), up
1:	bt	$1, R32(n)
	jnc	1f
	mov	(up), %r8
	mov	-8(up), %r9
	mov	%r8, (rp)
	mov	%r9, -8(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()