dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bd1	 1.39		 ? 1.45				Y/N
C AMD bobcat	 1.97		 ? 8.17		1.5/1.5		N
C Intel P4	 2.26		 illop				Y/N
C Intel core2	 0.52		 0.82		opt/0.74	Y
C Intel NHM	 0.52		 0.65		opt/opt		Y
C Intel SBR	 0.51		 0.55		opt/0.51	Y
C Intel atom	 1.16		 1.70		opt/opt		Y
C VIA nano	 1.09		 1.10		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code somewhat to use it here.
C
C For operands of up to COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

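C For reference, a rough C equivalent of the operation performed here (a
C sketch only, using the limb types from gmp-impl.h; not part of the build):
C
C	void
C	mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)	/* copy limbs in increasing order */
C	    rp[i] = up[i];
C	}
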
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')

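C COPYI_SSE_THRESHOLD: operands of at most this many limbs use the plain
C 64-bit basecase loop at L(bc) below.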
ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)

	bt	$3, R32(rp)		C is rp 16-byte aligned?
	jnc	L(rp_aligned)		C jump if rp aligned

	movsq				C copy one limb
	dec	n

L(rp_aligned):
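C rp is now 16-byte aligned.  If up is not also 16-byte aligned, i.e.
C rp - up = 8 (mod 16), take the palignr-based path at L(uent).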
	bt	$3, R32(up)
	jc	L(uent)

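C If the SSE threshold guarantees n >= 8 here, pre-subtract 8 and fall
C straight into the copy loop; otherwise enter at L(am) to test n first.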
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

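C Main aligned loop: both rp and up are 16-byte aligned; copy 8 limbs
C (64 bytes) per iteration with 16-byte loads and stores.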
	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

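C Wind down: n is now 8 below the leftover count, but its low three bits
C still equal that count (0-7); copy any remaining 4, 2 and 1 limbs.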
	bt	$2, R32(n)
	jnc	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	bt	$1, R32(n)
	jnc	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up - rp < 5 or up - rp > 15 limbs
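C The check below deflects to the basecase loop when up - rp is 5..15 limbs
C (40..120 bytes, computed as an unsigned byte distance).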
	lea	-40(up), %rax		C 40 = 5 * GMP_LIMB_BYTES
	sub	rp, %rax
	cmp	$80, %rax		C 80 = (15-5) * GMP_LIMB_BYTES
	jbe	L(bc)			C deflect to plain loop

	sub	$16, n
	jc	L(uend)

	movdqa	120(up), %xmm3

	sub	$16, n
	jmp	L(um)

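C Main unaligned loop: copy 16 limbs (128 bytes) per iteration.  All loads
C and stores are 16-byte aligned; each palignr($8, %xmm_lo, %xmm_hi) shifts
C the concatenation xmm_hi:xmm_lo right by 8 bytes, leaving in xmm_hi the 16
C source bytes that belong at the 16-byte-aligned store address.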
	ALIGN(16)
L(utop):movdqa	120(up), %xmm3
	movdqa	%xmm0, -128(rp)
	sub	$16, n
L(um):	movdqa	104(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	88(up), %xmm1
	movdqa	%xmm3, 112(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	72(up), %xmm0
	movdqa	%xmm2, 96(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	56(up), %xmm3
	movdqa	%xmm1, 80(rp)
	palignr($8, %xmm3, %xmm0)
	movdqa	40(up), %xmm2
	movdqa	%xmm0, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	128(up), up
	lea	128(rp), rp
	jnc	L(utop)

	movdqa	%xmm0, -128(rp)

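C Wind down: copy any remaining 8, 4, 2 and 1 limbs, still combining pairs
C of aligned loads with palignr for the 16-byte stores.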
L(uend):bt	$3, R32(n)
	jnc	1f
	movdqa	56(up), %xmm3
	movdqa	40(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	bt	$2, R32(n)
	jnc	1f
	movdqa	24(up), %xmm1
	movdqa	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	bt	$1, R32(n)
	jnc	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small-operand speed, not for
C correctness as the above code is currently written.

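C Copy 4 limbs per iteration with plain 64-bit moves, then handle the
C remaining 0-3 limbs.  rp is biased down by one limb so that the tail
C stores below can use offsets 8(rp) and 16(rp).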
L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(1 || COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(1 || COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	bt	$0, R32(n)
	jnc	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	bt	$1, R32(n)
	jnc	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()