dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 2.0				N
C AMD K10	 0.85		 1.3				Y/N
C AMD bull	 1.40		 1.40				Y
C AMD pile     0.9-1.4	       0.9-1.4				Y
C AMD steam
C AMD excavator
C AMD bobcat	 3.1		 3.1				N
C AMD jaguar	 0.91		 0.91		opt/opt		Y
C Intel P4	 2.28		 illop				Y
C Intel core2	 1.02		 1.02				N
C Intel NHM	 0.53		 0.68				Y
C Intel SBR	 0.51		 0.75		opt/0.65	Y/N
C Intel IBR	 0.50		 0.57		opt/opt		Y
C Intel HWL	 0.51		 0.64		opt/0.58	Y
C Intel BWL	 0.61		 0.65		0.57/opt	Y
C Intel atom	 3.68		 3.68				N
C Intel SLM	 1.09		 1.35				N
C VIA nano	 1.17		 5.09				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.
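
C The rp-alignment fix-up in the prologue below corresponds to this hedged C
C sketch (illustrative only, not part of this file's build; it assumes 8-byte
C limbs, n >= 1 as mpn_com requires, and uintptr_t from <stdint.h>):
C
C	if (((uintptr_t) rp & 8) != 0)	/* rp 8-byte but not 16-byte aligned */
C	  {
C	    *rp++ = ~ *up++;		/* one 8-byte op 16-byte aligns rp */
C	    n--;
C	  }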

C Instead of having separate loops for aligned and unaligned reads, we always
C read using MOVDQU.  This works well except on core2, where performance
C doubles when reading with MOVDQA (for an aligned source).  It is unclear how
C best to handle the unaligned case there.
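
C For reference, the routine as a whole computes the equivalent of the C loop
C below (a sketch, not part of this file's build; the name mpn_com_ref is
C hypothetical, and mp_limb_t/mp_size_t are GMP's types from gmp.h):
C
C	void
C	mpn_com_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = ~ up[i];
C	}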

C INPUT PARAMETERS
define(`rp', `%rdi')		C destination pointer
define(`up', `%rsi')		C source pointer
define(`n',  `%rdx')		C limb count

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_com)
	FUNC_ENTRY(3)

	pcmpeqb	%xmm7, %xmm7		C set to 111...111

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	(up), %rax		C complement one limb to align rp
	lea	8(up), up
	not	%rax
	mov	%rax, (rp)
	lea	8(rp), rp
	dec	n

	sub	$14, n
	jc	L(sma)			C branch if fewer than 14 limbs

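C Main loop: complement 14 limbs (7 x 16 bytes) per iteration, reading with
C unaligned MOVDQU and writing with aligned MOVDQA.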
	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	lea	112(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	pxor	%xmm7, %xmm4
	pxor	%xmm7, %xmm5
	pxor	%xmm7, %xmm6
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	lea	112(rp), rp
L(ali):	sub	$14, n
	jnc	L(top)

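C Tail: 0 to 13 limbs remain.  Handle them via the binary digits of n:
C first 8 limbs, then 4, then 2, then one final limb with an 8-byte move.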
L(sma):	add	$14, n
	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	(up), %rax
	not	%rax
	mov	%rax, (rp)
1:
L(don):	FUNC_EXIT()
	ret
EPILOGUE()