dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 2.0				N
C AMD K10	 0.85		 1.3				Y/N
C AMD bull	 1.40		 1.40				Y
C AMD pile     0.9-1.4	       0.9-1.4				Y
C AMD steam
C AMD excavator
C AMD bobcat	 3.1		 3.1				N
C AMD jaguar	 0.91		 0.91		opt/opt		Y
C Intel P4	 2.28		 illop				Y
C Intel core2	 1.02		 1.02				N
C Intel NHM	 0.53		 0.68				Y
C Intel SBR	 0.51		 0.75		opt/0.65	Y/N
C Intel IBR	 0.50		 0.57		opt/opt		Y
C Intel HWL	 0.51		 0.64		opt/0.58	Y
C Intel BWL	 0.61		 0.65		0.57/opt	Y
C Intel atom	 3.68		 3.68				N
C Intel SLM	 1.09		 1.35				N
C VIA nano	 1.17		 5.09				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for aligned and unaligned reads, we read
C using MOVDQU.  This works well except on core2, where performance doubles
C when an aligned source is read with MOVDQA.  It is unclear how best to
C handle the unaligned case there.
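
C A rough C-level equivalent of this strategy, as an illustrative sketch
C using SSE2 intrinsics (sketch_com is a made-up name; for simplicity it
C stores unaligned throughout, whereas the code below first aligns rp so
C that every 16-byte store can be MOVDQA):
C
C	#include <emmintrin.h>	/* SSE2 intrinsics */
C	#include <gmp.h>	/* mp_limb_t, mp_size_t */
C
C	void
C	sketch_com (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  __m128i ones = _mm_set1_epi32 (-1);	/* 128 set bits, cf. pcmpeqb */
C	  mp_size_t i = 0;
C	  for (; i + 2 <= n; i += 2)	/* one 16-byte vector = 2 limbs */
C	    {
C	      __m128i x = _mm_loadu_si128 ((const __m128i *) (up + i));
C	      _mm_storeu_si128 ((__m128i *) (rp + i),
C			       _mm_xor_si128 (x, ones));
C	    }
C	  if (i < n)		/* odd leftover limb, done with scalar code */
C	    rp[i] = ~up[i];
C	}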

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')
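C C-level prototype (the GMP manual names the source operand sp; here it is up):
C	void mpn_com (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)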

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_com)
	FUNC_ENTRY(3)

IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')

	pcmpeqb	%xmm7, %xmm7		C set to 111...111

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	(up), %rax		C complement one limb with scalar code...
	lea	8(up), up
	not	%rax
	mov	%rax, (rp)		C ...so that rp becomes 16-byte aligned
	lea	8(rp), rp
	dec	n

	sub	$14, n			C loop does 14 limbs (7 x 16 bytes) per iteration
	jc	L(sma)			C branch if fewer than 14 limbs remain

C Main loop: read with unaligned MOVDQU, complement with PXOR against the
C all-ones register, write with aligned MOVDQA.
	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	lea	112(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	pxor	%xmm7, %xmm4
	pxor	%xmm7, %xmm5
	pxor	%xmm7, %xmm6
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	lea	112(rp), rp
L(ali):	sub	$14, n
	jnc	L(top)

L(sma):	add	$14, n			C restore n; 0 <= n <= 13 limbs remain
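C Handle the remaining limbs according to the bits of n, largest chunk first.
C Bit 3 set: complement 8 limbs (four 16-byte vectors).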
	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
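C Bit 2 set: complement 4 limbs (two 16-byte vectors).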
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
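C Bit 1 set: complement 2 limbs (one 16-byte vector).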
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
1:
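C Bit 0 set: complement the last limb with scalar code.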
	test	$1, R8(n)
	jz	1f
	mov	(up), %rax
	not	%rax
	mov	%rax, (rp)
1:
L(don):
IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	add	$56, %rsp	')
	FUNC_EXIT()
	ret
EPILOGUE()