xref: /netbsd-src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/chacha-x86.S (revision 1b3d6f93806f8821fe459e13ad13e605b37c6d43)
1#include <machine/asm.h>
2.text
/*
 * ChaCha20_ctr32 -- XOR a buffer with the ChaCha20 keystream (i386, AT&T
 * syntax, all arguments passed on the stack).
 *
 * Stack arguments (offsets are relative to %esp after the four register
 * pushes in the prologue):
 *   20(%esp)  arg0  destination buffer
 *   24(%esp)  arg1  source buffer
 *   28(%esp)  arg2  length in bytes; 0 returns immediately
 *   32(%esp)  arg3  pointer to 8 dwords, copied into state words 4..11
 *   36(%esp)  arg4  pointer to 4 dwords, copied into state words 12..15;
 *                   word 12 is used as a 32-bit block counter and is
 *                   advanced by one for every 64-byte block produced
 *
 * %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and
 * the flags are clobbered.  No return value is set.
 *
 * If both capability bits tested below are set in OPENSSL_ia32cap_P the
 * function tail-jumps into .Lssse3_shortcut with %eax holding the run-time
 * address of .Lpic_point; otherwise the integer-only path below is used.
 *
 * Frame of the integer path after "subl $132,%esp":
 *     0.. 60(%esp)  working state x0..x15 (also the keystream block in
 *                   the tail path)
 *    80..108(%esp)  saved input words 4..11  (word i lives at 64+4*i)
 *   112..124(%esp)  saved input words 12..15 (112 = current block counter)
 *   128(%esp)       remaining double-round count
 *   152(%esp)       arg0 slot, reused to hold the current output pointer
 *   156(%esp)       arg1 slot, reused to hold the current input pointer
 *   160(%esp)       arg2 slot, reused to hold the remaining byte count
 */
3.globl	ChaCha20_ctr32
4.type	ChaCha20_ctr32,@function
5.align	16
6ChaCha20_ctr32:
7.L_ChaCha20_ctr32_begin:
8	#ifdef __CET__
9
/* Raw opcode bytes F3 0F 1E FB (endbr32), emitted only when __CET__ is defined. */
10.byte	243,15,30,251
11	#endif
12
/* Save the callee-saved registers; arguments now start at 20(%esp). */
13	pushl	%ebp
14	pushl	%ebx
15	pushl	%esi
16	pushl	%edi
/* Nothing to do when the byte length (arg2) is zero. */
17	xorl	%eax,%eax
18	cmpl	28(%esp),%eax
19	je	.L000no_data
/* call/pop idiom: %eax = run-time address of .Lpic_point (PIC base). */
20	call	.Lpic_point
21.Lpic_point:
22	popl	%eax
23	leal	OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
/*
 * Take the SSSE3 path only if bit 24 of capability word 0 and bit 9 of
 * capability word 1 are both set (FXSR and SSSE3 in OpenSSL's
 * OPENSSL_ia32cap layout).  %eax must stay intact for that path.
 */
24	testl	$16777216,(%ebp)
25	jz	.L001x86
26	testl	$512,4(%ebp)
27	jz	.L001x86
28	jmp	.Lssse3_shortcut
/* ---- integer-only path ---- */
29.L001x86:
30	movl	32(%esp),%esi
31	movl	36(%esp),%edi
/* Allocate the 132-byte frame described in the header comment. */
32	subl	$132,%esp
/* Copy the 8 dwords at arg3 to 80..108(%esp) (input words 4..11). */
33	movl	(%esi),%eax
34	movl	4(%esi),%ebx
35	movl	8(%esi),%ecx
36	movl	12(%esi),%edx
37	movl	%eax,80(%esp)
38	movl	%ebx,84(%esp)
39	movl	%ecx,88(%esp)
40	movl	%edx,92(%esp)
41	movl	16(%esi),%eax
42	movl	20(%esi),%ebx
43	movl	24(%esi),%ecx
44	movl	28(%esi),%edx
45	movl	%eax,96(%esp)
46	movl	%ebx,100(%esp)
47	movl	%ecx,104(%esp)
48	movl	%edx,108(%esp)
/*
 * Copy the 4 dwords at arg4 to 112..124(%esp) (input words 12..15).
 * The counter word is stored pre-decremented because .L002entry adds 1
 * before every block, so the first block uses the caller's value.
 */
49	movl	(%edi),%eax
50	movl	4(%edi),%ebx
51	movl	8(%edi),%ecx
52	movl	12(%edi),%edx
53	subl	$1,%eax
54	movl	%eax,112(%esp)
55	movl	%ebx,116(%esp)
56	movl	%ecx,120(%esp)
57	movl	%edx,124(%esp)
58	jmp	.L002entry
59.align	16
/*
 * Reached after each full 64-byte block when bytes remain: write the
 * advanced input pointer, output pointer and remaining length back into
 * the argument slots.  The first block skips this and uses the slots as
 * passed by the caller.
 */
60.L003outer_loop:
61	movl	%ebx,156(%esp)
62	movl	%eax,152(%esp)
63	movl	%ecx,160(%esp)
/*
 * Build the initial working state for one block.  Words 0..3 are the
 * constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 (the ASCII
 * string "expand 32-byte k").  Six words live in registers at loop entry:
 *   %eax = x0, %ebp = x4, %ecx = x8, %esi = x9, %edx = x12 (counter);
 * the remaining words are in their 4*i(%esp) slots.
 */
64.L002entry:
65	movl	$1634760805,%eax
66	movl	$857760878,4(%esp)
67	movl	$2036477234,8(%esp)
68	movl	$1797285236,12(%esp)
69	movl	84(%esp),%ebx
70	movl	88(%esp),%ebp
71	movl	104(%esp),%ecx
72	movl	108(%esp),%esi
73	movl	116(%esp),%edx
74	movl	120(%esp),%edi
75	movl	%ebx,20(%esp)
76	movl	%ebp,24(%esp)
77	movl	%ecx,40(%esp)
78	movl	%esi,44(%esp)
79	movl	%edx,52(%esp)
80	movl	%edi,56(%esp)
81	movl	92(%esp),%ebx
82	movl	124(%esp),%edi
83	movl	112(%esp),%edx
84	movl	80(%esp),%ebp
85	movl	96(%esp),%ecx
86	movl	100(%esp),%esi
/* Advance the block counter and keep the new value as input word 12. */
87	addl	$1,%edx
88	movl	%ebx,28(%esp)
89	movl	%edi,60(%esp)
90	movl	%edx,112(%esp)
/* 10 iterations; each iteration is one column round plus one diagonal round. */
91	movl	$10,%ebx
92	jmp	.L004loop
93.align	16
/*
 * One double round.  Each quarter-round QR(a,b,c,d) is
 *   a+=b; d^=a; d<<<=16;  c+=d; b^=c; b<<<=12;
 *   a+=b; d^=a; d<<<=8;   c+=d; b^=c; b<<<=7;
 * The loads and stores of neighbouring quarter-rounds are interleaved,
 * so the markers below show where each "a+=b" starts, not a clean cut.
 * The loop counter is parked at 128(%esp) because %ebx is needed as data.
 */
94.L004loop:
/* QR(x0,x4,x8,x12): a=%eax b=%ebx c=%ecx d=%edx */
95	addl	%ebp,%eax
96	movl	%ebx,128(%esp)
97	movl	%ebp,%ebx
98	xorl	%eax,%edx
99	roll	$16,%edx
100	addl	%edx,%ecx
101	xorl	%ecx,%ebx
102	movl	52(%esp),%edi
103	roll	$12,%ebx
104	movl	20(%esp),%ebp
105	addl	%ebx,%eax
106	xorl	%eax,%edx
107	movl	%eax,(%esp)
108	roll	$8,%edx
109	movl	4(%esp),%eax
110	addl	%edx,%ecx
111	movl	%edx,48(%esp)
112	xorl	%ecx,%ebx
/* QR(x1,x5,x9,x13): a=%eax b=%ebp c=%esi d=%edi */
113	addl	%ebp,%eax
114	roll	$7,%ebx
115	xorl	%eax,%edi
116	movl	%ecx,32(%esp)
117	roll	$16,%edi
118	movl	%ebx,16(%esp)
119	addl	%edi,%esi
120	movl	40(%esp),%ecx
121	xorl	%esi,%ebp
122	movl	56(%esp),%edx
123	roll	$12,%ebp
124	movl	24(%esp),%ebx
125	addl	%ebp,%eax
126	xorl	%eax,%edi
127	movl	%eax,4(%esp)
128	roll	$8,%edi
129	movl	8(%esp),%eax
130	addl	%edi,%esi
131	movl	%edi,52(%esp)
132	xorl	%esi,%ebp
/* QR(x2,x6,x10,x14): a=%eax b=%ebx c=%ecx d=%edx */
133	addl	%ebx,%eax
134	roll	$7,%ebp
135	xorl	%eax,%edx
136	movl	%esi,36(%esp)
137	roll	$16,%edx
138	movl	%ebp,20(%esp)
139	addl	%edx,%ecx
140	movl	44(%esp),%esi
141	xorl	%ecx,%ebx
142	movl	60(%esp),%edi
143	roll	$12,%ebx
144	movl	28(%esp),%ebp
145	addl	%ebx,%eax
146	xorl	%eax,%edx
147	movl	%eax,8(%esp)
148	roll	$8,%edx
149	movl	12(%esp),%eax
150	addl	%edx,%ecx
151	movl	%edx,56(%esp)
152	xorl	%ecx,%ebx
/* QR(x3,x7,x11,x15): a=%eax b=%ebp c=%esi d=%edi; x10 stays in %ecx */
153	addl	%ebp,%eax
154	roll	$7,%ebx
155	xorl	%eax,%edi
156	roll	$16,%edi
157	movl	%ebx,24(%esp)
158	addl	%edi,%esi
159	xorl	%esi,%ebp
160	roll	$12,%ebp
161	movl	20(%esp),%ebx
162	addl	%ebp,%eax
163	xorl	%eax,%edi
164	movl	%eax,12(%esp)
165	roll	$8,%edi
166	movl	(%esp),%eax
167	addl	%edi,%esi
168	movl	%edi,%edx
169	xorl	%esi,%ebp
/* Diagonal QR(x0,x5,x10,x15): a=%eax b=%ebx c=%ecx d=%edx; x11 stays in %esi */
170	addl	%ebx,%eax
171	roll	$7,%ebp
172	xorl	%eax,%edx
173	roll	$16,%edx
174	movl	%ebp,28(%esp)
175	addl	%edx,%ecx
176	xorl	%ecx,%ebx
177	movl	48(%esp),%edi
178	roll	$12,%ebx
179	movl	24(%esp),%ebp
180	addl	%ebx,%eax
181	xorl	%eax,%edx
182	movl	%eax,(%esp)
183	roll	$8,%edx
184	movl	4(%esp),%eax
185	addl	%edx,%ecx
186	movl	%edx,60(%esp)
187	xorl	%ecx,%ebx
/* Diagonal QR(x1,x6,x11,x12): a=%eax b=%ebp c=%esi d=%edi */
188	addl	%ebp,%eax
189	roll	$7,%ebx
190	xorl	%eax,%edi
191	movl	%ecx,40(%esp)
192	roll	$16,%edi
193	movl	%ebx,20(%esp)
194	addl	%edi,%esi
195	movl	32(%esp),%ecx
196	xorl	%esi,%ebp
197	movl	52(%esp),%edx
198	roll	$12,%ebp
199	movl	28(%esp),%ebx
200	addl	%ebp,%eax
201	xorl	%eax,%edi
202	movl	%eax,4(%esp)
203	roll	$8,%edi
204	movl	8(%esp),%eax
205	addl	%edi,%esi
206	movl	%edi,48(%esp)
207	xorl	%esi,%ebp
/* Diagonal QR(x2,x7,x8,x13): a=%eax b=%ebx c=%ecx d=%edx */
208	addl	%ebx,%eax
209	roll	$7,%ebp
210	xorl	%eax,%edx
211	movl	%esi,44(%esp)
212	roll	$16,%edx
213	movl	%ebp,24(%esp)
214	addl	%edx,%ecx
215	movl	36(%esp),%esi
216	xorl	%ecx,%ebx
217	movl	56(%esp),%edi
218	roll	$12,%ebx
219	movl	16(%esp),%ebp
220	addl	%ebx,%eax
221	xorl	%eax,%edx
222	movl	%eax,8(%esp)
223	roll	$8,%edx
224	movl	12(%esp),%eax
225	addl	%edx,%ecx
226	movl	%edx,52(%esp)
227	xorl	%ecx,%ebx
/* Diagonal QR(x3,x4,x9,x14): a=%eax b=%ebp c=%esi d=%edi; x8 stays in %ecx */
228	addl	%ebp,%eax
229	roll	$7,%ebx
230	xorl	%eax,%edi
231	roll	$16,%edi
232	movl	%ebx,28(%esp)
233	addl	%edi,%esi
234	xorl	%esi,%ebp
235	movl	48(%esp),%edx
236	roll	$12,%ebp
/* Reload the loop counter now that %ebx is free again. */
237	movl	128(%esp),%ebx
238	addl	%ebp,%eax
239	xorl	%eax,%edi
240	movl	%eax,12(%esp)
241	roll	$8,%edi
242	movl	(%esp),%eax
243	addl	%edi,%esi
244	movl	%edi,56(%esp)
245	xorl	%esi,%ebp
246	roll	$7,%ebp
247	decl	%ebx
248	jnz	.L004loop
/*
 * Rounds finished.  In registers: %eax=x0 %ebp=x4 %ecx=x8 %esi=x9
 * %edx=x12 %edi=x14.  Add the matching input words (feed-forward) and
 * pick the full-block or tail path from the remaining length.
 */
249	movl	160(%esp),%ebx
250	addl	$1634760805,%eax
251	addl	80(%esp),%ebp
252	addl	96(%esp),%ecx
253	addl	100(%esp),%esi
254	cmpl	$64,%ebx
255	jb	.L005tail
/*
 * Full block: %ebx = input pointer, %eax = output pointer.  Word 0 is
 * parked at (%esp) because %eax is needed for the output pointer; it is
 * written last.  The other 15 words are XORed and stored in three groups
 * of five.
 */
256	movl	156(%esp),%ebx
257	addl	112(%esp),%edx
258	addl	120(%esp),%edi
259	xorl	(%ebx),%eax
260	xorl	16(%ebx),%ebp
261	movl	%eax,(%esp)
262	movl	152(%esp),%eax
263	xorl	32(%ebx),%ecx
264	xorl	36(%ebx),%esi
265	xorl	48(%ebx),%edx
266	xorl	56(%ebx),%edi
267	movl	%ebp,16(%eax)
268	movl	%ecx,32(%eax)
269	movl	%esi,36(%eax)
270	movl	%edx,48(%eax)
271	movl	%edi,56(%eax)
/* Words 1, 2, 3, 5, 6. */
272	movl	4(%esp),%ebp
273	movl	8(%esp),%ecx
274	movl	12(%esp),%esi
275	movl	20(%esp),%edx
276	movl	24(%esp),%edi
277	addl	$857760878,%ebp
278	addl	$2036477234,%ecx
279	addl	$1797285236,%esi
280	addl	84(%esp),%edx
281	addl	88(%esp),%edi
282	xorl	4(%ebx),%ebp
283	xorl	8(%ebx),%ecx
284	xorl	12(%ebx),%esi
285	xorl	20(%ebx),%edx
286	xorl	24(%ebx),%edi
287	movl	%ebp,4(%eax)
288	movl	%ecx,8(%eax)
289	movl	%esi,12(%eax)
290	movl	%edx,20(%eax)
291	movl	%edi,24(%eax)
/* Words 7, 10, 11, 13, 15. */
292	movl	28(%esp),%ebp
293	movl	40(%esp),%ecx
294	movl	44(%esp),%esi
295	movl	52(%esp),%edx
296	movl	60(%esp),%edi
297	addl	92(%esp),%ebp
298	addl	104(%esp),%ecx
299	addl	108(%esp),%esi
300	addl	116(%esp),%edx
301	addl	124(%esp),%edi
302	xorl	28(%ebx),%ebp
303	xorl	40(%ebx),%ecx
304	xorl	44(%ebx),%esi
305	xorl	52(%ebx),%edx
306	xorl	60(%ebx),%edi
/* Advance both pointers by one block and store the parked word 0. */
307	leal	64(%ebx),%ebx
308	movl	%ebp,28(%eax)
309	movl	(%esp),%ebp
310	movl	%ecx,40(%eax)
311	movl	160(%esp),%ecx
312	movl	%esi,44(%eax)
313	movl	%edx,52(%eax)
314	movl	%edi,60(%eax)
315	movl	%ebp,(%eax)
316	leal	64(%eax),%eax
/* Remaining length -= 64; loop while bytes are left. */
317	subl	$64,%ecx
318	jnz	.L003outer_loop
319	jmp	.L006done
/*
 * Tail (fewer than 64 bytes left, count in %ebx): finish the feed-forward
 * for all 16 words and write the complete keystream block to 0..60(%esp).
 */
320.L005tail:
321	addl	112(%esp),%edx
322	addl	120(%esp),%edi
323	movl	%eax,(%esp)
324	movl	%ebp,16(%esp)
325	movl	%ecx,32(%esp)
326	movl	%esi,36(%esp)
327	movl	%edx,48(%esp)
328	movl	%edi,56(%esp)
329	movl	4(%esp),%ebp
330	movl	8(%esp),%ecx
331	movl	12(%esp),%esi
332	movl	20(%esp),%edx
333	movl	24(%esp),%edi
334	addl	$857760878,%ebp
335	addl	$2036477234,%ecx
336	addl	$1797285236,%esi
337	addl	84(%esp),%edx
338	addl	88(%esp),%edi
339	movl	%ebp,4(%esp)
340	movl	%ecx,8(%esp)
341	movl	%esi,12(%esp)
342	movl	%edx,20(%esp)
343	movl	%edi,24(%esp)
344	movl	28(%esp),%ebp
345	movl	40(%esp),%ecx
346	movl	44(%esp),%esi
347	movl	52(%esp),%edx
348	movl	60(%esp),%edi
349	addl	92(%esp),%ebp
350	addl	104(%esp),%ecx
351	addl	108(%esp),%esi
352	addl	116(%esp),%edx
353	addl	124(%esp),%edi
/* Set up the byte loop: %ebp = input, %ecx = output, %esi = index 0. */
354	movl	%ebp,28(%esp)
355	movl	156(%esp),%ebp
356	movl	%ecx,40(%esp)
357	movl	152(%esp),%ecx
358	movl	%esi,44(%esp)
359	xorl	%esi,%esi
360	movl	%edx,52(%esp)
361	movl	%edi,60(%esp)
362	xorl	%eax,%eax
363	xorl	%edx,%edx
/*
 * out[i] = in[i] ^ keystream[i] for the remaining %ebx bytes.  The index
 * is bumped before the store, hence the -1 displacement.
 */
364.L007tail_loop:
365	movb	(%esi,%ebp,1),%al
366	movb	(%esp,%esi,1),%dl
367	leal	1(%esi),%esi
368	xorb	%dl,%al
369	movb	%al,-1(%ecx,%esi,1)
370	decl	%ebx
371	jnz	.L007tail_loop
/* Drop the frame; .L000no_data is entered with no frame allocated. */
372.L006done:
373	addl	$132,%esp
374.L000no_data:
375	popl	%edi
376	popl	%esi
377	popl	%ebx
378	popl	%ebp
379	ret
380.size	ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
/*
 * ChaCha20_ssse3 -- ChaCha20 keystream XOR using SSE2/SSSE3, one 64-byte
 * block per iteration (i386, AT&T syntax, arguments on the stack).
 *
 * Stack arguments (offsets relative to %esp after the four pushes):
 *   20(%esp)  destination buffer            -> %edi
 *   24(%esp)  source buffer                 -> %esi
 *   28(%esp)  length in bytes               -> %ecx
 *   32(%esp)  pointer to 32 bytes (state words 4..11)   -> %edx
 *   36(%esp)  pointer to 16 bytes (state words 12..15)  -> %ebx
 *
 * There are two ways in:
 *   1. ChaCha20_ctr32 jumps to .Lssse3_shortcut after it has already
 *      pushed the four registers, rejected length 0, and left the
 *      run-time address of .Lpic_point in %eax.
 *   2. A direct call to the global symbol ChaCha20_ssse3.  The code
 *      between the pushes and .Lssse3_shortcut runs only on this route
 *      and establishes the same two preconditions (non-zero length and
 *      %eax = address of .Lpic_point).  Without it the table address
 *      below would be computed from an undefined %eax, and a zero length
 *      would make the tail loop's "decl %ecx" wrap around.
 *
 * Frame: %esp is lowered by at least 524 bytes and aligned to 64, so the
 * movdqa accesses to 0..63(%esp) are aligned.  The pre-frame %esp is kept
 * at 512(%esp) because %ebp is reused as an index in the tail loop.
 *     0(%esp)  state row 0 (constants)    16(%esp)  state row 1
 *    32(%esp)  state row 2                48(%esp)  state row 3 (counter row)
 * The same 64 bytes hold the keystream block in the tail path.
 *
 * Register roles in the round loop: %xmm0..%xmm3 = state rows a,b,c,d;
 * %xmm4 = shift temporary; %xmm6/%xmm7 = pshufb masks for rotate-left by
 * 16 and by 8; %edx = double-round counter; %eax = .Lssse3_data.
 *
 * %ebp, %ebx, %esi, %edi are saved and restored; %eax, %ecx, %edx, the
 * flags and %xmm0-%xmm7 (except %xmm5 on the tail-only path) are clobbered.
 */
381.globl	ChaCha20_ssse3
382.type	ChaCha20_ssse3,@function
383.align	16
384ChaCha20_ssse3:
385.L_ChaCha20_ssse3_begin:
386	#ifdef __CET__
387
/* Raw opcode bytes F3 0F 1E FB (endbr32), emitted only when __CET__ is defined. */
388.byte	243,15,30,251
389	#endif
390
391	pushl	%ebp
392	pushl	%ebx
393	pushl	%esi
394	pushl	%edi
/*
 * Direct-entry preamble (skipped by the jump from ChaCha20_ctr32).
 * Mirror the caller-side checks: return at once for a zero length, then
 * use the call/pop idiom to obtain a PIC base and rebase it so that
 * %eax = address of .Lpic_point, which is what the leal below expects.
 */
	cmpl	$0,28(%esp)
	je	.Lssse3_no_data
	call	.Lssse3_pic_point
.Lssse3_pic_point:
	popl	%eax
	leal	.Lpic_point-.Lssse3_pic_point(%eax),%eax
/* Common path.  Precondition: %eax = address of .Lpic_point, length != 0. */
395.Lssse3_shortcut:
396	movl	20(%esp),%edi
397	movl	24(%esp),%esi
398	movl	28(%esp),%ecx
399	movl	32(%esp),%edx
400	movl	36(%esp),%ebx
/* Build the 64-byte-aligned frame and remember the old %esp in it. */
401	movl	%esp,%ebp
402	subl	$524,%esp
403	andl	$-64,%esp
404	movl	%ebp,512(%esp)
/* %eax = run-time address of the constant table .Lssse3_data. */
405	leal	.Lssse3_data-.Lpic_point(%eax),%eax
/* Row 3: the 16 bytes at the fifth argument (counter row). */
406	movdqu	(%ebx),%xmm3
407.L0081x:
/* Row 0 = constants at table+32; rows 1,2 = the 32 bytes at the fourth argument. */
408	movdqa	32(%eax),%xmm0
409	movdqu	(%edx),%xmm1
410	movdqu	16(%edx),%xmm2
/* pshufb masks: table+0 rotates each dword left by 16, table+16 by 8. */
411	movdqa	(%eax),%xmm6
412	movdqa	16(%eax),%xmm7
/* This dword store is overwritten by the movdqa of %xmm3 to 48(%esp) below. */
413	movl	%ebp,48(%esp)
/* Keep the input state on the stack for the feed-forward addition. */
414	movdqa	%xmm0,(%esp)
415	movdqa	%xmm1,16(%esp)
416	movdqa	%xmm2,32(%esp)
417	movdqa	%xmm3,48(%esp)
/* 10 double rounds; %edx (key pointer) is no longer needed. */
418	movl	$10,%edx
419	jmp	.L009loop1x
420.align	16
/*
 * Next block: reload the saved input state and add (1,0,0,0) from
 * table+80 to row 3, i.e. increment the 32-bit counter in lane 0, then
 * store the updated row back as the new input state.
 */
421.L010outer1x:
422	movdqa	80(%eax),%xmm3
423	movdqa	(%esp),%xmm0
424	movdqa	16(%esp),%xmm1
425	movdqa	32(%esp),%xmm2
426	paddd	48(%esp),%xmm3
427	movl	$10,%edx
428	movdqa	%xmm3,48(%esp)
429	jmp	.L009loop1x
430.align	16
/*
 * One double round on all four columns at once.
 * The .byte sequences are hand-encoded pshufb instructions:
 *   102,15,56,0,222 = pshufb %xmm6,%xmm3  (d <<<= 16)
 *   102,15,56,0,223 = pshufb %xmm7,%xmm3  (d <<<= 8)
 * Rotations by 12 and 7 are done with a shift pair and por via %xmm4.
 */
431.L009loop1x:
/* Column round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; */
432	paddd	%xmm1,%xmm0
433	pxor	%xmm0,%xmm3
434.byte	102,15,56,0,222
435	paddd	%xmm3,%xmm2
436	pxor	%xmm2,%xmm1
437	movdqa	%xmm1,%xmm4
438	psrld	$20,%xmm1
439	pslld	$12,%xmm4
440	por	%xmm4,%xmm1
/* a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7; */
441	paddd	%xmm1,%xmm0
442	pxor	%xmm0,%xmm3
443.byte	102,15,56,0,223
444	paddd	%xmm3,%xmm2
445	pxor	%xmm2,%xmm1
446	movdqa	%xmm1,%xmm4
447	psrld	$25,%xmm1
448	pslld	$7,%xmm4
449	por	%xmm4,%xmm1
/* Rotate rows b, c, d by 1, 2, 3 lanes so the diagonals line up as columns. */
450	pshufd	$78,%xmm2,%xmm2
451	pshufd	$57,%xmm1,%xmm1
452	pshufd	$147,%xmm3,%xmm3
453	nop
/* Diagonal round: same quarter-round sequence on the rotated rows. */
454	paddd	%xmm1,%xmm0
455	pxor	%xmm0,%xmm3
456.byte	102,15,56,0,222
457	paddd	%xmm3,%xmm2
458	pxor	%xmm2,%xmm1
459	movdqa	%xmm1,%xmm4
460	psrld	$20,%xmm1
461	pslld	$12,%xmm4
462	por	%xmm4,%xmm1
463	paddd	%xmm1,%xmm0
464	pxor	%xmm0,%xmm3
465.byte	102,15,56,0,223
466	paddd	%xmm3,%xmm2
467	pxor	%xmm2,%xmm1
468	movdqa	%xmm1,%xmm4
469	psrld	$25,%xmm1
470	pslld	$7,%xmm4
471	por	%xmm4,%xmm1
/* Undo the lane rotation (b and d use the opposite shuffles from above). */
472	pshufd	$78,%xmm2,%xmm2
473	pshufd	$147,%xmm1,%xmm1
474	pshufd	$57,%xmm3,%xmm3
475	decl	%edx
476	jnz	.L009loop1x
/* Feed-forward: add the saved input state to get the keystream block. */
477	paddd	(%esp),%xmm0
478	paddd	16(%esp),%xmm1
479	paddd	32(%esp),%xmm2
480	paddd	48(%esp),%xmm3
481	cmpl	$64,%ecx
482	jb	.L011tail
/* Full block: XOR 64 input bytes with the keystream; unaligned I/O. */
483	movdqu	(%esi),%xmm4
484	movdqu	16(%esi),%xmm5
485	pxor	%xmm4,%xmm0
486	movdqu	32(%esi),%xmm4
487	pxor	%xmm5,%xmm1
488	movdqu	48(%esi),%xmm5
489	pxor	%xmm4,%xmm2
490	pxor	%xmm5,%xmm3
491	leal	64(%esi),%esi
492	movdqu	%xmm0,(%edi)
493	movdqu	%xmm1,16(%edi)
494	movdqu	%xmm2,32(%edi)
495	movdqu	%xmm3,48(%edi)
496	leal	64(%edi),%edi
497	subl	$64,%ecx
498	jnz	.L010outer1x
499	jmp	.L012done
/*
 * Tail (1..63 bytes left in %ecx): spill the keystream block over the
 * saved state and XOR byte by byte.  %ebp becomes the byte index here,
 * which is why the old %esp is restored from 512(%esp) and not from %ebp.
 */
500.L011tail:
501	movdqa	%xmm0,(%esp)
502	movdqa	%xmm1,16(%esp)
503	movdqa	%xmm2,32(%esp)
504	movdqa	%xmm3,48(%esp)
505	xorl	%eax,%eax
506	xorl	%edx,%edx
507	xorl	%ebp,%ebp
/* The index is bumped before the store, hence the -1 displacement. */
508.L013tail_loop:
509	movb	(%esp,%ebp,1),%al
510	movb	(%esi,%ebp,1),%dl
511	leal	1(%ebp),%ebp
512	xorb	%dl,%al
513	movb	%al,-1(%edi,%ebp,1)
514	decl	%ecx
515	jnz	.L013tail_loop
/* Release the aligned frame. */
516.L012done:
517	movl	512(%esp),%esp
/* Direct-entry zero-length exit joins here: no frame was built on that route. */
.Lssse3_no_data:
518	popl	%edi
519	popl	%esi
520	popl	%ebx
521	popl	%ebp
522	ret
523.size	ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
/*
 * Constant table for the SSSE3 path, 64-byte aligned so that movdqa can
 * be used on it.  Layout (byte offset from .Lssse3_data):
 *     0  pshufb mask: rotate every dword left by 16 bits
 *    16  pshufb mask: rotate every dword left by 8 bits
 *    32  ChaCha constants 0x61707865,0x3320646e,0x79622d32,0x6b206574
 *        (ASCII "expand 32-byte k")
 *    48  0,1,2,3
 *    64  4,4,4,4
 *    80  1,0,0,0   added to state row 3 to step the block counter
 *    96  4,0,0,0
 *   112  0,-1,-1,-1
 * The rows at offsets 48, 64, 96 and 112 are not referenced by the code
 * visible in this file.
 */
524.align	64
525.Lssse3_data:
526.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
527.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
528.long	1634760805,857760878,2036477234,1797285236
529.long	0,1,2,3
530.long	4,4,4,4
531.long	1,0,0,0
532.long	4,0,0,0
533.long	0,-1,-1,-1
534.align	64
/* NUL-terminated ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>" */
535.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
536.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
537.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
538.byte	114,103,62,0
/* 16-byte, 4-byte-aligned common symbol: the capability words tested by ChaCha20_ctr32. */
539.comm	OPENSSL_ia32cap_P,16,4
540
/*
 * ELF note in .note.gnu.property.  Fields, in order:
 *   namesz = 1f - 0f, descsz = 4f - 1f, note type 5
 *   (NT_GNU_PROPERTY_TYPE_0), then the name "GNU".
 * The descriptor holds one property: type 0xc0000002
 * (GNU_PROPERTY_X86_FEATURE_1_AND), data size 3f - 2f, value 3
 * (the IBT and SHSTK feature bits), each part padded to 4 bytes.
 */
541	.section ".note.gnu.property", "a"
542	.p2align 2
543	.long 1f - 0f
544	.long 4f - 1f
545	.long 5
5460:
547	.asciz "GNU"
5481:
549	.p2align 2
550	.long 0xc0000002
551	.long 3f - 2f
5522:
553	.long 3
5543:
555	.p2align 2
5564:
557