xref: /netbsd-src/sys/arch/arm/arm/cpu_in_cksum.S (revision 80d9064ac03cbb6a4174695f0d5b237c8766d3d0)
1/*	$NetBSD: cpu_in_cksum.S,v 1.8 2013/12/22 16:29:42 matt Exp $	*/
2
3/*
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38/*
39 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
40 */
41
42#include <machine/asm.h>
43RCSID("$NetBSD: cpu_in_cksum.S,v 1.8 2013/12/22 16:29:42 matt Exp $")
44
45#include "assym.h"
46
47/*
48 * int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
49 *
50 * Entry:
51 *	r0	m
52 *	r1	len
53 *	r2	off
54 *	r3	initial_sum
55 *
56 * Function wide register usage
57 *	r8	accumulated sum
58 *	r9	remaining length to parse
59 *	ip	pointer to next mbuf
60 */
61/* LINTSTUB: Func: int cpu_in_cksum(struct mbuf *, int, int, uint32_t) */
62ENTRY(cpu_in_cksum)
63	push	{r4-r11,lr}
64
	/*
	 * Register usage for the remainder of the function:
	 *	r8  = 32-bit accumulated sum (seeded from initial_sum)
	 *	r9  = bytes of payload still to be checksummed
	 *	ip  = next mbuf in the chain (NULL-terminated)
	 *	r10 = running count of bytes summed so far; only its low
	 *	      bit matters (even/odd byte parity within the stream)
	 */
65	mov	r8, r3			/* Accumulate sum in r8 */
66	mov	r9, r1			/* save len in r9 */
67	mov	ip, r0			/* set ip to the current mbuf */
68
	/*
	 * Phase 1: walk the mbuf chain until we reach the mbuf that
	 * contains byte 'off' (r2).  Falling off the end of the chain
	 * before then is a fatal error (panic below).
	 */
69.Lin_cksum_skip_loop:
70	ldr	r1, [ip, #(M_LEN)]
71	ldr	r0, [ip, #(M_DATA)]
72	ldr	ip, [ip, #(M_NEXT)]
73.Lin_cksum_skip_entry:
74	subs	r2, r2, r1		/* offset = offset - mbuf length */
75	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
76	cmp	ip, #0x00
77	bne	.Lin_cksum_skip_loop
78	b	.Lin_cksum_whoops
79
80.Lin_cksum_skip_done:
	/* Here -m_len <= r2 < 0: r2 is the (negative) distance from the
	 * end of this mbuf to the first byte we want. */
81	add	r0, r2, r0		/* data += offset (offset is < 0) */
82	add	r0, r0, r1		/* data += length of mbuf */
83					/* data == start of data to cksum */
84	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
85	mov	r10, #0x00		/* no bytes summed yet */
86	b	.Lin_cksum_entry
87
	/*
	 * Phase 2: sum each mbuf (or the tail of the first one).
	 * arm_cksumdata(r0 = data, r1 = length) returns its partial
	 * 32-bit sum in r2 and clobbers r0-r7.
	 */
88.Lin_cksum_loop:
89	ldr	r1, [ip, #(M_LEN)]
90	ldr	r0, [ip, #(M_DATA)]
91	ldr	ip, [ip, #(M_NEXT)]
92.Lin_cksum_entry:
	/* Clamp this chunk to the number of bytes still wanted */
93	cmp	r9, r1
94#ifdef __thumb__
95	bge	1f
96	mov	r1, r9
97#else
98	movlt	r1, r9
99#endif
1001:	sub	r9, r9, r1		/* remaining -= chunk length */
	/*
	 * Low bit of r11 = (bytes summed so far) ^ (chunk start address).
	 * When set, this chunk's bytes occupy the opposite even/odd byte
	 * lane from their position in the logical stream, so its partial
	 * sum must be rotated by 8 bits (see movne ... ror #8 below)
	 * before being folded into the running total.
	 */
101	eor	r11, r10, r0
102	add	r10, r10, r1
103	adds	r2, r1, #0x00		/* r2 = chunk length; Z set if empty */
104#ifdef __thumb__
105	it	ne
106#endif
107	blne	_ASM_LABEL(arm_cksumdata)	/* skip call for an empty chunk (r2 stays 0) */
108	tst	r11, #0x01
109#ifdef __thumb__
110	it	ne
111#endif
112	movne	r2, r2, ror #8		/* byte-swap odd-parity partial sum */
	/* Fold the partial sum in, with end-around carry */
113	adds	r8, r8, r2
114	adc	r8, r8, #0x00
115	cmp	ip, #00			/* more mbufs in the chain? */
116	bne	.Lin_cksum_loop
117
	/*
	 * Fold the 32-bit sum to 16 bits with end-around carry, take the
	 * one's complement, and return it in the low 16 bits of r0.
	 */
118#ifdef __thumb__
119	mov	r0, r8
120	lsls	r2, r0, #16
121	adds	r0, r0, r2
122	bcc	1f
123	adds	r0, r0, #65536
1241:	mvns	r0, r0
125	lsrs	r0, r0, #16
126#else
127	adds	r8, r8, r8, lsl #16	/* sum += sum << 16 */
128	addcs	r8, r8, #65536		/* end-around carry */
129	mvn	r0, r8			/* one's complement */
130	lsr	r0, r0, #16		/* result is in the high half */
131#endif
132	pop	{r4-r11, pc}
133
	/* Reached only if 'off'/'len' overrun the mbuf chain */
134.Lin_cksum_whoops:
135	adr	r0, .Lin_cksum_whoops_str
136	bl	_C_LABEL(panic)
137.Lin_cksum_whoops_str:
138	.asciz	"in_cksum: out of mbufs\n"
139	.p2align	5
140END(cpu_in_cksum)
141
142
143/*
144 * The main in*_cksum() workhorse...
145 *
146 * Entry parameters:
147 *	r0	Pointer to buffer
148 *	r1	Buffer length
149 *	lr	Return address
150 *
151 * Returns:
152 *	r2	Accumulated 32-bit sum
153 *
154 * Clobbers:
155 *	r0-r7
156 */
157/* LINTSTUB: Ignore */
158ASENTRY_NP(arm_cksumdata)
	/*
	 * In:  r0 = buffer, r1 = length in bytes, lr = return address
	 * Out: r2 = accumulated 32-bit sum (caller folds it to 16 bits)
	 * Clobbers r0-r7.  Data is summed a word at a time with
	 * adds/adcs chains, folding the carry back in with adcs #0.
	 */
159#ifdef __PROG26
160	str	lr, [sp, #-4]!		/* for SVC26 mode */
161#endif
162#ifdef __XSCALE__
163	pld	[r0]			/* Pre-fetch the start of the buffer */
164#endif
165	movs	r2, #0
166
167	/* We first have to word-align the buffer.  */
168	ands	r7, r0, #0x03		/* r7 = byte offset within the first word */
169	beq	.Lcksumdata_wordaligned
170	eors	r0, r0, r7		/* r0 is word aligned */
171	ldr	r2, [r0], #0x04		/* fetch the word holding the first bytes */
172#ifdef __thumb__
173	movs	r4, r7
174	lsls	r4, r4, #3
175#else
176	lsl	r4, r7, #3		/* r4 = offset in bits */
177#endif
	/* Zero the bytes that precede the start of the buffer */
178#if defined(__ARMEB__)
179	lsls	r2, r2, r4
180	lsrs	r2, r2, r4
181#else
182	lsrs	r2, r2, r4
183	lsls	r2, r2, r4
184#endif
185	rsb	r7, r7, #0x04		/* r7 = bytes consumed from this word */
186	subs	r1, r1, r7		/* Enough bytes left to make it? */
187	bgt	.Lcksumdata_wordaligned
188#ifdef __PROG26
189	ldreq	pc, [sp], #4		/* done */
190#else
191	RETc(eq)			/* done */
192#endif
	/*
	 * The buffer also ends inside this first word (r1 < 0, i.e.
	 * len < 4 - offset; only r1 in {-1,-2} is reachable).  Zero the
	 * bytes beyond the end of the buffer too, then return the word.
	 * Since r1 = len - 4 + offset, r1 + 4 = offset + len, the count
	 * of valid bytes in the word; 4 - (offset + len) bytes must be
	 * discarded from the other end.
	 * Fix: the previous code computed the discard amount with two
	 * "adds r7, r7, r1" (giving 2*len + offset - 4), which
	 * over-discarded by (4 - len) bytes and zeroed out the valid
	 * data of 1-2 byte buffers at odd word offsets.
	 */
193	adds	r7, r1, #0x04		/* r7 = offset + len (valid bytes) */
195	rsb	r7, r7, #4		/* r7 = trailing bytes to discard */
196	lsls	r7, r7, #3		/* ...converted to bits */
197#if defined(__ARMEB__)
198	lsrs	r2, r2, r7
199	lsls	r2, r2, r7
200#else
201	lsls	r2, r2, r7
202	lsrs	r2, r2, r7
203#endif
204#ifdef __PROG26
205	ldr	pc, [sp], #4		/* done */
206#else
207	RET				/* done */
208#endif
209
210	/* Buffer is now word aligned */
211.Lcksumdata_wordaligned:
212#ifdef __XSCALE__
213	cmp	r1, #0x04		/* Less than 4 bytes left? */
214	blt	.Lcksumdata_endgame	/* Yup */
215
216	/* Now quad-align, if necessary */
217	ands	r7, r0, #0x04
218	ldrne	r7, [r0], #0x04		/* r7 = aligning word, or 0 if none */
219	subne	r1, r1, #0x04
220	subs	r1, r1, #0x40
221	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */
222
223	/*
224	 * Buffer is now quad aligned. Sum 64 bytes at a time.
225	 * Note: First ldrd is hoisted above the loop, together with
226	 * setting r6 to zero to avoid stalling for results in the
227	 * loop. (r7 is live, from above).
228	 */
229	ldrd	r4, r5, [r0], #0x08
230	mov	r6, #0x00
231.Lcksumdata_bigloop:
232	pld	[r0, #0x18]
233	adds	r2, r2, r6
234	adcs	r2, r2, r7
235	ldrd	r6, r7, [r0], #0x08
236	adcs	r2, r2, r4
237	adcs	r2, r2, r5
238	ldrd	r4, r5, [r0], #0x08
239	adcs	r2, r2, r6
240	adcs	r2, r2, r7
241	ldrd	r6, r7, [r0], #0x08
242	adcs	r2, r2, r4
243	adcs	r2, r2, r5
244	ldrd	r4, r5, [r0], #0x08
245	adcs	r2, r2, r6
246	adcs	r2, r2, r7
247	pld	[r0, #0x18]
248	ldrd	r6, r7, [r0], #0x08
249	adcs	r2, r2, r4
250	adcs	r2, r2, r5
251	ldrd	r4, r5, [r0], #0x08
252	adcs	r2, r2, r6
253	adcs	r2, r2, r7
254	ldrd	r6, r7, [r0], #0x08
255	adcs	r2, r2, r4
256	adcs	r2, r2, r5
257	adcs	r2, r2, #0x00		/* fold the final carry back in */
258	subs	r1, r1, #0x40
259	ldrdge	r4, r5, [r0], #0x08
260	bge	.Lcksumdata_bigloop
261
262	adds	r2, r2, r6		/* r6/r7 still need summing */
263.Lcksumdata_bigloop_end:
264	adcs	r2, r2, r7
265	adcs	r2, r2, #0x00
266
267#else	/* !__XSCALE__ */
268
269	subs	r1, r1, #0x40
270	blt	.Lcksumdata_bigloop_end
271
	/* Sum 64 bytes per iteration, carrying between words with adcs */
272.Lcksumdata_bigloop:
273	ldmia	r0!, {r3, r4, r5, r6}
274	adds	r2, r2, r3
275	adcs	r2, r2, r4
276	adcs	r2, r2, r5
277	ldmia	r0!, {r3, r4, r5, r7}
278	adcs	r2, r2, r6
279	adcs	r2, r2, r3
280	adcs	r2, r2, r4
281	adcs	r2, r2, r5
282	ldmia	r0!, {r3, r4, r5, r6}
283	adcs	r2, r2, r7
284	adcs	r2, r2, r3
285	adcs	r2, r2, r4
286	adcs	r2, r2, r5
287	ldmia	r0!, {r3, r4, r5, r7}
288	adcs	r2, r2, r6
289	adcs	r2, r2, r3
290	adcs	r2, r2, r4
291	adcs	r2, r2, r5
292	adcs	r2, r2, r7
293	adcs	r2, r2, #0x00
294	subs	r1, r1, #0x40
295	bge	.Lcksumdata_bigloop
296.Lcksumdata_bigloop_end:
297#endif
298
299	adds	r1, r1, #0x40		/* undo the loop bias; Z set when done */
300#ifdef __PROG26
301	ldreq	pc, [sp], #4
302#else
303	RETc(eq)
304#endif
305	cmp	r1, #0x20		/* at least 32 bytes left? */
306
307#ifdef __XSCALE__
308	ldrdge	r4, r5, [r0], #0x08	/* Avoid stalling pld and result */
309	blt	.Lcksumdata_less_than_32
310	pld	[r0, #0x18]
311	ldrd	r6, r7, [r0], #0x08
312	adds	r2, r2, r4
313	adcs	r2, r2, r5
314	ldrd	r4, r5, [r0], #0x08
315	adcs	r2, r2, r6
316	adcs	r2, r2, r7
317	ldrd	r6, r7, [r0], #0x08
318	adcs	r2, r2, r4
319	adcs	r2, r2, r5
320	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
321	adcs	r2, r2, r7
322#else
323	blt	.Lcksumdata_less_than_32
324	ldmia	r0!, {r3, r4, r5, r6}
325	adds	r2, r2, r3
326	adcs	r2, r2, r4
327	adcs	r2, r2, r5
328	ldmia	r0!, {r3, r4, r5, r7}
329	adcs	r2, r2, r6
330	adcs	r2, r2, r3
331	adcs	r2, r2, r4
332	adcs	r2, r2, r5
333	adcs	r2, r2, r7
334#endif
335	adcs	r2, r2, #0x00
336	subs	r1, r1, #0x20
337#ifdef __PROG26
338	ldreq	pc, [sp], #4
339#else
340	RETc(eq)
341#endif
342
343.Lcksumdata_less_than_32:
344	/* There are less than 32 bytes left */
	/*
	 * Computed branch into the three 8-byte handlers below:
	 * r3 = remaining & 0x18 (bytes the table must sum),
	 * r4 = (0x18 - r3) * 3/2 = code bytes to skip, since each handler
	 * is 3 ARM instructions (12 bytes); the first is padded to 16
	 * bytes by the nop, which together with the +8 pc read offset
	 * makes the arithmetic land on the correct handler.
	 * NOTE(review): these byte offsets assume 4-byte ARM encodings;
	 * confirm a Thumb build lays the handlers out identically.
	 */
345	and	r3, r1, #0x18
346	rsb	r4, r3, #0x18
347	subs	r1, r1, r3
348	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
349#ifdef __thumb__
350	it	ne
351#endif
352	addne	pc, pc, r4
353
354/*
355 * Note: We use ldm here, even on Xscale, since the combined issue/result
356 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
357 */
358	/* At least 24 bytes remaining... */
359	ldmia	r0!, {r4, r5}
360	nop				/* pads this handler to 16 bytes for addne above */
361	adcs	r2, r2, r4		/* C is clear on entry (see adds above) */
362	adcs	r2, r2, r5
363
364	/* At least 16 bytes remaining... */
365	ldmia	r0!, {r4, r5}
366	adcs	r2, r2, r4
367	adcs	r2, r2, r5
368
369	/* At least 8 bytes remaining... */
370	ldmia	r0!, {r4, r5}
371	adcs	r2, r2, r4
372	adcs	r2, r2, r5
373
374	/* Less than 8 bytes remaining... */
375	adcs	r2, r2, #0x00		/* fold carry from the chain above */
376	subs	r1, r1, #0x04
377	blt	.Lcksumdata_lessthan4
378
379	ldr	r4, [r0], #0x04		/* one last whole word */
380	subs	r1, r1, #0x04
381	adds	r2, r2, r4
382	adcs	r2, r2, #0x00
383
384	/* Deal with < 4 bytes remaining */
385.Lcksumdata_lessthan4:
386	adds	r1, r1, #0x04		/* r1 = bytes left (0..3); Z set if none */
387#ifdef __PROG26
388	ldreq	pc, [sp], #4
389#else
390	RETc(eq)
391#endif
392
393	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
394.Lcksumdata_endgame:
395	ldr	r3, [r0]		/* Fetch last word */
396	rsb	r1, r1, #4		/* get discard amount */
397	lsl	r1, r1, #3		/* turn it into bits */
398#ifdef __ARMEB__
399	lsr	r3, r3, r1		/* discard least significant bits */
400	lsl	r3, r3, r1		/* shift back filling with zeros */
401#else
402	lsl	r3, r3, r1		/* discard least significant bits */
403	lsr	r3, r3, r1		/* shift back filling with zeros */
404#endif
405	adds	r2, r2, r3
406	adcs	r2, r2, #0x00
407#ifdef __PROG26
408	ldr	pc, [sp], #4
409#else
410	RET
411#endif
412ASEND(arm_cksumdata)
413