dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bd1	 0.70		 0.66				Y
C AMD bd2	 0.68		 0.66				Y
C AMD bd3	 ?		 ?
C AMD bd4	 ?		 ?
C AMD bt1	 1.97		 8.16		1.5/1.5		N
C AMD bt2	 0.77		 0.93		0.65/opt	N/Y
C AMD zn1	 ?		 ?
C AMD zn2	 ?		 ?
C Intel P4	 2.26		 illop				Y/N
C Intel CNR	 0.52		 0.64		opt/opt		Y
C Intel NHM	 0.52		 0.71		0.50/0.67	N
C Intel SBR	 0.51		 0.54		opt/0.51	Y
C Intel IBR	 0.50		 0.54		opt/opt		Y
C Intel HWL	 0.50		 0.51		opt/opt		Y
C Intel BWL	 0.55		 0.55		opt/opt		Y
C Intel atom	 1.16		 1.61		opt/opt		Y
C Intel SLM	 1.02		 1.07		opt/opt		Y
C VIA nano	 1.09		 1.08		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
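C
C For reference, the whole function implements the semantics of the following
C minimal C sketch of mpn_copyi (illustration only, not part of the build; it
C assumes the operands permit an ascending-order copy, and uses the parameter
C names defined below):
C
C	void
C	mpn_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}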

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned

	movsq				C copy one limb
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

	ALIGN(16)
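C Main loop for fully aligned operands: copy eight limbs (64 bytes) per
C iteration using 16-byte moves.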
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)
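C
C In this path rp is 16-byte aligned but up is only 8-byte aligned, so each
C aligned 16-byte store has to combine halves of two aligned 16-byte loads.
C A rough C intrinsics sketch of one such step, using _mm_alignr_epi8 from
C tmmintrin.h (illustration only, not part of the build; lo, hi, out and aup
C are names used only here, aup being the byte address up - 8, i.e. the
C 16-byte boundary just below up):
C
C	__m128i lo  = _mm_load_si128 ((const __m128i *) aup);
C	__m128i hi  = _mm_load_si128 ((const __m128i *) (aup + 16));
C	__m128i out = _mm_alignr_epi8 (hi, lo, 8);	/* bytes 8..23 of hi:lo */
C	_mm_store_si128 ((__m128i *) rp, out);		/* the 16 bytes at up */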

	cmp	$16, n
	jc	L(ued0)
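C The IFDOS lines below save xmm6-xmm8 on Windows, where they are
C callee-saved registers.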

IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
IFDOS(`	movdqa	%xmm8, 32(%rsp)	')

	movaps	120(up), %xmm7
	movaps	104(up), %xmm6
	movaps	88(up), %xmm5
	movaps	72(up), %xmm4
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	lea	128(up), up
	sub	$32, n
	jc	L(ued1)

	ALIGN(16)
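C Main loop for the misaligned case: copy 16 limbs (128 bytes) per iteration,
C realigning the source data with palignr.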
L(utop):movaps	-104(up), %xmm1
	sub	$16, n
	movaps	-120(up), %xmm0
	palignr($8, %xmm6, %xmm7)
	movaps	-136(up), %xmm8
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movaps	120(up), %xmm7
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movaps	104(up), %xmm6
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movaps	88(up), %xmm5
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movaps	72(up), %xmm4
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movaps	56(up), %xmm3
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movaps	40(up), %xmm2
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	lea	128(up), up
	movdqa	%xmm0, (rp)
	lea	128(rp), rp
	jnc	L(utop)

L(ued1):movaps	-104(up), %xmm1
	movaps	-120(up), %xmm0
	movaps	-136(up), %xmm8
	palignr($8, %xmm6, %xmm7)
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	movdqa	%xmm0, (rp)
	lea	128(rp), rp
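C The IFDOS lines below restore the xmm registers saved above (Windows only).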

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	movdqa	32(%rsp), %xmm8	')
IFDOS(`	add	$56, %rsp	')

L(ued0):test	$8, R8(n)
	jz	1f
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	movaps	-8(up), %xmm4
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm4, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movaps	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands; not needed for
C correctness, the way the code above is currently written.
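C
C A rough C sketch of this basecase (illustration only, not part of the
C build): copy four limbs per iteration, then mop up the remaining 0-3 limbs,
C odd limb first.
C
C	while (n >= 4)
C	  {
C	    rp[0] = up[0];  rp[1] = up[1];  rp[2] = up[2];  rp[3] = up[3];
C	    rp += 4;  up += 4;  n -= 4;
C	  }
C	if (n & 1) { rp[0] = up[0];  rp++;  up++; }
C	if (n & 2) { rp[0] = up[0];  rp[1] = up[1]; }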

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()