/*	$NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $	*/

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
 */

#include <machine/asm.h>
RCSID("$NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $")

#include "assym.h"

/*
 * int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Function wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	ip	pointer to next mbuf
 */
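/*
 * For orientation, a rough C sketch of the semantics implemented below
 * (illustrative only, not the kernel's MI implementation; the
 * sketch_sum_bytes() helper is a stand-in for L_cksumdata further down,
 * and the alignment/byte-order fix-ups handled by the real code are
 * glossed over):
 *
 *	uint16_t
 *	sketch_in_cksum(struct mbuf *m, int len, int off, uint32_t sum)
 *	{
 *		int n;
 *
 *		while (m != NULL && off >= m->m_len) {
 *			off -= m->m_len;
 *			m = m->m_next;
 *		}
 *		while (m != NULL && len > 0) {
 *			n = m->m_len - off;
 *			if (n > len)
 *				n = len;
 *			sum = sketch_sum_bytes(sum,
 *			    mtod(m, uint8_t *) + off, n);
 *			len -= n;
 *			off = 0;
 *			m = m->m_next;
 *		}
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;
 *	}
 */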
/* LINTSTUB: Func: int cpu_in_cksum(struct mbuf *, int, int, uint32_t) */
ENTRY(cpu_in_cksum)
	stmfd	sp!, {r4-r11,lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative, start with this mbuf */
	cmp	ip, #0x00		/* more mbufs in the chain? */
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops	/* ran out of mbufs while skipping */

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00		/* r10 counts bytes summed so far */
	b	.Lin_cksum_entry
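/*
 * Worked example of the skip arithmetic above: with off = 20 and a
 * first mbuf of length 14, the subs leaves r2 = 6 and we advance to
 * the next mbuf.  If that one is 32 bytes long, the subs leaves
 * r2 = -26, so we stop here: data + (-26) + 32 = data + 6 points at
 * the 7th byte of this mbuf, and r1 = 26 is how much of it remains
 * to be summed.
 */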

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1			/* clamp chunk to the remaining length */
	movlt	r1, r9
	sub	r9, r9, r1
	eor	r11, r10, r0		/* parity of stream offset vs. data address */
	add	r10, r10, r1
	adds	r2, r1, #0x00		/* skip the call if this chunk is empty */
	blne	_ASM_LABEL(L_cksumdata)
	tst	r11, #0x01
	movne	r2, r2, ror #8		/* odd parity: byte-swap the partial sum */
	adds	r8, r8, r2		/* accumulate with end-around carry */
	adc	r8, r8, #0x00
	cmp	ip, #00
	bne	.Lin_cksum_loop
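/*
 * The ror #8 above uses a standard property of the one's complement
 * sum: a partial sum computed as if its chunk began on an even byte
 * boundary can be merged at an odd boundary by swapping its bytes.
 * r10 counts the bytes summed so far, so r11 = r10 ^ r0 is odd exactly
 * when the chunk's offset within the overall stream and its buffer
 * address disagree in parity.  A hedged C illustration of the merge
 * (sketch_* names are ours, not kernel API):
 *
 *	uint32_t
 *	sketch_merge_partial(uint32_t acc, uint32_t partial, int odd)
 *	{
 *		if (odd)
 *			partial = (partial >> 8) | (partial << 24);
 *		acc += partial;
 *		if (acc < partial)
 *			acc++;
 *		return acc;
 *	}
 *
 * which is what the movne/adds/adc sequence above does.
 */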

	mov	r1, #0xff		/* r1 = 0x0000ffff */
	orr	r1, r1, #0xff00
	and	r0, r8, r1		/* fold the 32-bit sum to 16 bits */
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
	eor	r0, r0, r1		/* return the one's complement */
	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	_C_LABEL(panic)
.Lin_cksum_whoops_str:
	.asciz	"in_cksum: out of mbufs\n"
	.align	5


/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 */
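/*
 * A hedged C model of what this helper returns (illustrative only;
 * the sketch assumes a halfword-aligned buffer and the little-endian
 * convention for a trailing odd byte, whereas the code below also
 * copes with misaligned buffers and big-endian lane placement):
 *
 *	uint32_t
 *	sketch_cksumdata(const uint8_t *buf, int len)
 *	{
 *		const uint16_t *p = (const uint16_t *)buf;
 *		uint32_t sum = 0, w;
 *
 *		while (len > 1) {
 *			w = *p++;
 *			sum += w;
 *			if (sum < w)
 *				sum++;
 *			len -= 2;
 *		}
 *		if (len > 0)
 *			sum += *(const uint8_t *)p;
 *		return sum;
 *	}
 */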
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef __PROG26
	str	lr, [sp, #-4]!		/* for SVC26 mode */
#endif
#ifdef __XSCALE__
	pld	[r0]			/* Pre-fetch the start of the buffer */
#endif
	mov	r2, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes needed to reach a word boundary */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02		/* eq = 2 bytes to fix up, gt = 3, lt = 1 */
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrgeb	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrgtb	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/* Combine the three bytes depending on endianness and alignment */
#ifdef __ARMEB__
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
#ifdef __PROG26
	ldreq	pc, [sp], #4		/* All done? */
#else
	moveq	pc, lr			/* All done? */
#endif
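/*
 * Worked example of the head fix-up above, little-endian, buffer
 * starting one byte past a word boundary: r7 = 3, all three bytes
 * b0 b1 b2 are fetched, and the "ne" combine builds
 * r2 = b1 | (b0 << 8) | (b2 << 24).  Each byte thus lands in a byte
 * lane whose parity matches the parity of its address (odd address ->
 * high lane on little-endian, mirrored on big-endian), which is the
 * convention the caller's parity test and ror #8 correction rely on.
 */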

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef __XSCALE__
	cmp	r1, #0x04		/* Less than 4 bytes left? */
	blt	.Lcksumdata_endgame	/* Yup */

	/* Now quad-align, if necessary */
	ands	r7, r0, #0x04
	ldrne	r7, [r0], #0x04
	subne	r1, r1, #0x04
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */

	/*
	 * Buffer is now quad aligned. Sum 64 bytes at a time.
	 * Note: First ldrd is hoisted above the loop, together with
	 * setting r6 to zero to avoid stalling for results in the
	 * loop. (r7 is live, from above).
	 */
	ldrd	r4, [r0], #0x08
	mov	r6, #0x00
.Lcksumdata_bigloop:
	pld	[r0, #0x18]
	adds	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	ldrged	r4, [r0], #0x08
	bge	.Lcksumdata_bigloop

	adds	r2, r2, r6		/* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
	adcs	r2, r2, r7
	adc	r2, r2, #0x00

#else	/* !__XSCALE__ */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif
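/*
 * Both variants of the loop above do the same thing: sum sixteen
 * 32-bit words per iteration, letting adcs thread the carries from one
 * add into the next and absorbing the final carry with adc #0 before
 * subs rewrites the flags.  Summing whole 32-bit words is safe because
 * the caller folds the result to 16 bits at the end (2^16 == 1 modulo
 * 0xffff, so the upper and lower halfwords simply add).  A hedged C
 * equivalent of one 64-byte step (sketch name is ours):
 *
 *	uint32_t
 *	sketch_sum64(uint32_t sum, const uint32_t *p)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++) {
 *			sum += p[i];
 *			if (sum < p[i])
 *				sum++;
 *		}
 *		return sum;
 *	}
 */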

	adds	r1, r1, #0x40
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	moveq	pc, lr
#endif
	cmp	r1, #0x20

#ifdef __XSCALE__
	ldrged	r4, [r0], #0x08		/* Avoid stalling pld and result */
	blt	.Lcksumdata_less_than_32
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
	adcs	r2, r2, r7
#else
	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
#endif
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	moveq	pc, lr
#endif

.Lcksumdata_less_than_32:
	/* There are fewer than 32 bytes left */
	and	r3, r1, #0x18		/* r3 = bytes covered by whole 8-byte groups */
	rsb	r4, r3, #0x18		/* r4 = 8 * number of groups to skip */
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* r4 = 12 * groups to skip; side effect: C flag cleared */
	addne	pc, pc, r4

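/*
 * The addne above is a small computed jump.  r3 = r1 & 0x18 is the
 * number of bytes still coverable by whole 8-byte groups, so
 * r4 = 0x18 - r3 is 8 times the number of ldm/adcs/adcs groups below
 * that must be skipped, and r4 + (r4 >> 1) rescales that to 12 bytes
 * per group (three 4-byte instructions).  Since reading pc in the
 * addne yields the address of the nop below, r4 = 12, 24 or 36 lands
 * on the 16-, 8- or 0-bytes-remaining entry point respectively, while
 * r4 = 0 leaves Z set so the addne is skipped and we fall straight
 * into the 24-byte case (the nop pads that first group so the 12-byte
 * stride lines up).  The adds also clears the carry flag, so the adcs
 * chain below starts without a stale carry.
 */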
/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop				/* pad: keeps the 12-byte computed-jump stride in step */
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	moveq	pc, lr
#endif

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrgeb	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrgtb	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#ifdef __ARMEB__
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
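/*
 * As in the head fix-up, each trailing byte is steered into a byte
 * lane whose parity matches the parity of its address.  For example,
 * two bytes b0 b1 left at an odd little-endian address combine to
 * r3 = b1 | (b0 << 8): b0 takes the high lane its odd address calls
 * for, so the caller's parity-based ror #8 correction stays valid for
 * a chunk that ends, as well as starts, off a halfword boundary.
 */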
	adds	r2, r2, r3
	adc	r2, r2, #0x00
#ifdef __PROG26
	ldr	pc, [sp], #4
#else
	mov	pc, lr
#endif
