xref: /netbsd-src/sys/arch/m68k/m68k/oc_cksum.s (revision 5ae0a955493425be9125d9bd3239f6810ff39e12)
1|	$NetBSD: oc_cksum.s,v 1.11 2023/09/26 14:33:55 tsutsui Exp $
2
3| Copyright (c) 1988 Regents of the University of California.
4| All rights reserved.
5|
6| Redistribution and use in source and binary forms, with or without
7| modification, are permitted provided that the following conditions
8| are met:
9| 1. Redistributions of source code must retain the above copyright
10|    notice, this list of conditions and the following disclaimer.
11| 2. Redistributions in binary form must reproduce the above copyright
12|    notice, this list of conditions and the following disclaimer in the
13|    documentation and/or other materials provided with the distribution.
14| 3. Neither the name of the University nor the names of its contributors
15|    may be used to endorse or promote products derived from this software
16|    without specific prior written permission.
17|
18| THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21| ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24| OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28| SUCH DAMAGE.
29|
30|	@(#)oc_cksum.s	7.2 (Berkeley) 11/3/90
31|
32|
33| oc_cksum: ones complement 16 bit checksum for MC68020.
34|
35| oc_cksum (buffer, count, strtval)
36|
37| Do a 16 bit one's complement sum of 'count' bytes from 'buffer'.
38| 'strtval' is the starting value of the sum (usually zero).
39|
40| It simplifies life in in_cksum if strtval can be >= 2^16.
41| This routine will work as long as strtval is < 2^31.
42|
43| Performance
44| -----------
45| This routine is intended for MC 68020s but should also work
46| for 68030s.  It (deliberately) doesn't worry about the alignment
47| of the buffer so will only work on a 68010 if the buffer is
48| aligned on an even address.  (Also, a routine written to use
49| 68010 "loop mode" would almost certainly be faster than this
50| code on a 68010).
51|
52| We don't worry about alignment because this routine is frequently
53| called with small counts: 20 bytes for IP header checksums and 40
54| bytes for TCP ack checksums.  For these small counts, testing for
55| bad alignment adds ~10% to the per-call cost.  Since, by the nature
56| of the kernel's allocator, the data we're called with is almost
57| always longword aligned, there is no benefit to this added cost
58| and we're better off letting the loop take a big performance hit
59| in the rare cases where we're handed an unaligned buffer.
60|
61| Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
62| tested on random data on four different types of processors (see
63| list below -- 64 was the largest unrolling because anything more
64| overflows the 68020 Icache).  On all the processors, the
65| throughput asymptote was located between 8 and 16 (closer to 8).
66| However, 16 was substantially better than 8 for small counts.
67| (It's clear why this happens for a count of 40: unroll-8 pays a
68| loop branch cost and unroll-16 doesn't.  But the tests also showed
69| that 16 was better than 8 for a count of 20.  It's not obvious to
70| me why.)  So, since 16 was good for both large and small counts,
71| the loop below is unrolled 16 times.
72|
73| The processors tested and their average time to checksum 1024 bytes
74| of random data were:
75|	Sun 3/50 (15MHz)	190 us/KB
76|	Sun 3/180 (16.6MHz)	175 us/KB
77|	Sun 3/60 (20MHz)	134 us/KB
78|	Sun 3/280 (25MHz)	 95 us/KB
79|
80| The cost of calling this routine was typically 10% of the per-
81| kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
82| and each additional byte cost 125ns.  With the high fixed cost,
83| it would clearly be a gain to "inline" this routine -- the
84| subroutine call adds 400% overhead to an IP header checksum.
85| However, in absolute terms, inlining would only gain 10us per
86| packet -- a 1% effect for a 1ms ethernet packet.  This is not
87| enough gain to be worth the effort.
88
89#include <m68k/asm.h>
90
91	.text
92
| oc_cksum(buffer, count, strtval)
|
| In (on the stack at entry, before %d2 is saved below):
|	 4(%sp)	buffer	- address of the first byte to sum
|	 8(%sp)	count	- number of bytes to sum
|	12(%sp)	strtval	- starting value of the sum (must be < 2^31)
| Out:
|	%d0	- 16 bit one's complement sum, zero extended to 32 bits
| Register use inside the routine:
|	%a0	- running pointer into the buffer
|	%a1	- (ColdFire only) address one past the last whole longword
|	%d0	- 32 bit accumulator; carries are chained through X
|	%d1	- byte count, then 64 byte chunk counter / fold scratch
|	%d2	- scratch; saved on entry and restored before return
| %d1, %a0, %a1 (ColdFire) and the condition codes are not preserved.
ENTRY(oc_cksum)
	movl	4(%sp),%a0	| get buffer ptr
	movl	8(%sp),%d1	| get byte count
	movl	12(%sp),%d0	| get starting value
	movl	%d2,-(%sp)	| free a reg

	| test for possible 1, 2 or 3 bytes of excess at end
	| of buffer.  The usual case is no excess (the usual
	| case is header checksums) so we give that the faster
	| 'not taken' leg of the compare.  (We do the excess
	| first because we're about to trash the low order
	| bits of the count in d1.)

	btst	#0,%d1
	jne	.L5		| if one or three bytes excess
	btst	#1,%d1
	jne	.L7		| if two bytes excess
.L1:
	| Any trailing 1-3 bytes have already been added to d0.  What is
	| left is a whole number of longwords: (count & 0x3c) bytes are
	| handled by jumping into the middle of the unrolled loop, the
	| rest in full 64 byte passes.  Every movl/addxl pair is 4 bytes
	| of code and consumes 4 bytes of data, so the negated byte
	| remainder is directly the code offset back from .L3.
#ifdef __mcoldfire__
	movq	#-4,%d2		| mask to clear bottom two bits
	andl	%d2,%d1		| longword truncate length
	movl	%d1,%d2		| move length to d2
	movl	%d1,%a1		| move length to a1
	addl	%a0,%a1		| add start so a1 now points to end
	movq	#0x3c,%d1	| then find fractions of a chunk
	andl	%d1,%d2
	negl	%d2
	subl	%d1,%d1		| this can never carry so X is cleared
#else
	movl	%d1,%d2		| move to d2
	lsrl	#6,%d1		| make cnt into # of 64 byte chunks
	andl	#0x3c,%d2	| then find fractions of a chunk
	negl	%d2
	andb	#0xf,%cc	| clear X
#endif
	jmp	(.L3-.-2:b,%pc,%d2)
.L2:
	| 16 times unrolled: d0 += *a0++ with the carry of each add fed
	| into the next one through X.  Do not change the size of this
	| sequence without revisiting the computed jump above.
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
	movl	(%a0)+,%d2
	addxl	%d2,%d0
.L3:
#ifdef __mcoldfire__
	cmpal	%a0,%a1		| cmpa doesn't affect X
	bne	.L2		| loop until reached
#else
	dbra	%d1,.L2		| (NB- dbra doesn't affect X)
#endif

	| Fold: result = low word + high word + pending X, with the
	| carry out of bit 15 wrapped around into bit 0.
	movl	%d0,%d1		| fold 32 bit sum to 16 bits
	swap	%d1		| (NB- swap doesn't affect X)
#ifdef __mcoldfire__
	mvzw	%d1,%d1		| zero extend %d1 (doesn't affect X)
	mvzw	%d0,%d0		| zero extend %d0 (doesn't affect X)
	addxl	%d1,%d0		| 17 bit result: bit 16 is the carry out
	| Both operands are below 2^16 here, so this long add can never
	| set C; the carry of the 16 bit sum has to be read from bit 16.
	btst	#16,%d0		| Z set if there was no carry
	jeq	.L4
	addql	#1,%d0		| end-around carry (bit 16 dropped at .L4)
#else
	addxw	%d1,%d0
	jcc	.L4
	addw	#1,%d0
#endif
.L4:
#ifdef __mcoldfire__
	mvzw	%d0,%d0
#else
	andl	#0xffff,%d0
#endif
	movl	(%sp)+,%d2
	rts

.L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
	btst	#1,%d1
	jeq	.L6		| if 1 excess

	| 3 bytes excess
#ifdef __mcoldfire__
	mvzw	(-3,%a0,%d1:l),%d2	| add in last full word then drop
#else
	clrl	%d2
	movw	(-3,%a0,%d1:l),%d2	| add in last full word then drop
#endif
	addl	%d2,%d0		|  through to pick up last byte

.L6:	| 1 byte excess
#ifdef __mcoldfire__
	mvzb	(-1,%a0,%d1:l),%d2
#else
	clrl	%d2
	movb	(-1,%a0,%d1:l),%d2
#endif
	lsll	#8,%d2		| odd trailing byte is the high byte of its word
	addl	%d2,%d0
	jra	.L1

.L7:	| 2 bytes excess
#ifdef __mcoldfire__
	mvzw	(-2,%a0,%d1:l),%d2
#else
	clrl	%d2
	movw	(-2,%a0,%d1:l),%d2
#endif
	addl	%d2,%d0
	jra	.L1
225