dnl  AMD64 SSE mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                cycles/limb       cycles/limb        cycles/limb
C                aligned, even n   unaligned, even n  other cases
C AMD K8,K9         1.65              1.65               1.8
C AMD K10           0.78              0.78               0.85
C AMD bd1           0.80              0.91               1.25
C AMD bobcat        2.15              2.15               2.37
C Intel P4          2.5               2.5                2.95
C Intel core2       1.17              1.25               1.25
C Intel NHM         0.87              0.90               0.90
C Intel SBR         0.63              0.79               0.77
C Intel atom        4.3               4.3                4.3     slower than plain code
C VIA nano          1.4               5.1                3.14    too alignment dependent

C NOTES
C  * We only honour the least significant 32 bits of the `which' and `nents'
C    arguments, which allows efficient code using just SSE2.  Honouring all
C    64 bits would require either the SSE4_1 pcmpeqq or some more elaborate
C    SSE2 sequence.
C  * We use movd for copying between xmm and plain registers, since old gas
C    rejects movq; gas assembles movd as movq anyway when given a 64-bit
C    general register.
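C  * Every one of the `nents' table entries is loaded and masked, so both
C    the memory access pattern and the branch pattern are independent of
C    `which'; this is what makes the selection side-channel silent.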

define(`rp',     `%rdi')
define(`tp',     `%rsi')
define(`n',      `%rdx')
define(`nents',  `%rcx')
define(`which',  `%r8')

define(`i',      `%r10')
define(`j',      `%r9')

C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
C          nents  n   rp   tab       which j    i   temp  *    *    *    *

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
C DOS64: the fifth argument, `which', arrives on the stack
IFDOS(`	mov	56(%rsp), %r8d	')

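C xmm6-xmm15 are callee-saved under the DOS64 ABI; save the four of them
C that this function clobbers (xmm6-xmm9) and restore them before returning.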
IFDOS(`	add	$-88, %rsp	')
IFDOS(`	movdqu	%xmm6, (%rsp)	')
IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
IFDOS(`	movdqu	%xmm8, 32(%rsp)	')
IFDOS(`	movdqu	%xmm9, 48(%rsp)	')

	movd	which, %xmm8
	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
	mov	$1, R32(%rax)
	movd	%rax, %xmm9
	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1

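C Process the n result limbs in 64-byte (8-limb) columns; the leftover
C n mod 8 limbs are handled by the cut-down loops after L(outer_end).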
	mov	n, j
	add	$-8, j			C j = n - 8, counts whole columns
	js	L(outer_end)

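C For each column, scan all nents table entries: xmm1 carries the entry
C index in all four dword lanes, so pcmpeqd against the `which' copies in
C xmm8 yields an all-ones mask for the wanted entry and all-zeros for every
C other one; each entry is ANDed with the mask and ORed into the
C accumulators.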
L(outer_top):
	mov	nents, i
	mov	tp, %r11		C remember start of this column
	pxor	%xmm1, %xmm1		C entry index = 0
	pxor	%xmm4, %xmm4		C clear the four accumulators
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	ALIGN(16)
L(top):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0		C mask = -(index == which)
	paddd	%xmm9, %xmm1		C index++
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	movdqu	32(tp), %xmm2
	movdqu	48(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm6
	por	%xmm3, %xmm7
	lea	(tp,n,8), tp		C same column, next table entry
	add	$-1, i
	jne	L(top)

	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	movdqu	%xmm6, 32(rp)
	movdqu	%xmm7, 48(rp)

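C Advance to the next column: %r11 still points at the start of the column
C just processed, and rp moves past the 64 bytes just stored.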
	lea	64(%r11), tp
	lea	64(rp), rp
	add	$-8, j
	jns	L(outer_top)
L(outer_end):

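C Tail code for the remaining n mod 8 limbs: bits 2, 1 and 0 of n select a
C 4-limb, a 2-limb and a 1-limb pass respectively, each a cut-down copy of
C the main loop.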
	test	$4, R8(n)
	je	L(b0xx)
L(b1xx):mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	ALIGN(16)
L(tp4):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp4)
	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	lea	32(%r11), tp
	lea	32(rp), rp

L(b0xx):test	$2, R8(n)
	je	L(b00x)
L(b01x):mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp2):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movdqu	0(tp), %xmm2
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp2)
	movdqu	%xmm4, 0(rp)
	lea	16(%r11), tp
	lea	16(rp), rp

L(b00x):test	$1, R8(n)
	je	L(b000)
L(b001):mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp1):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movq	0(tp), %xmm2		C just one limb (8 bytes)
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp1)
	movq	%xmm4, 0(rp)

L(b000):
IFDOS(`	movdqu	(%rsp), %xmm6	')
IFDOS(`	movdqu	16(%rsp), %xmm7	')
IFDOS(`	movdqu	32(%rsp), %xmm8	')
IFDOS(`	movdqu	48(%rsp), %xmm9	')
IFDOS(`	add	$88, %rsp	')
	FUNC_EXIT()
	ret
EPILOGUE()