/*	$NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $	*/
/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int	cpu_in_cksum(struct mbuf*, int, int, uint32_t);
#endif

/*
 * Checksum routine for Internet Protocol family headers (portable version).
 *
 * This routine is very heavily used in the network code and should be
 * modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32-bit architectures uses a 32-bit
 * accumulator and operates on 16-bit operands.
 *
 * The default implementation for 64-bit architectures uses a 64-bit
 * accumulator and operates on 32-bit operands.
 *
 * Both versions unroll the core of the inner loop to handle 32-byte /
 * 64-byte chunks. After each iteration of the inner loop, a partial
 * reduction is done to avoid carry overflow in long packets.
 */
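
/*
 * A worked example of the final reduction, for illustration only (the
 * value is made up): with a 32-bit accumulator holding sum = 0x0003fffd,
 *
 *	final_acc = (sum >> 16) + (sum & 0xffff);	0x3 + 0xfffd = 0x10000
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);	0x1 + 0x0 = 0x1
 *
 * leaves a 16-bit one's complement sum, and ~final_acc & 0xffff
 * (0xfffe here) is the value that goes on the wire.
 */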

#if ULONG_MAX == 0xffffffffUL
/* 32-bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
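	/*
	 * Pre-fold the 32-bit initial sum to 16 bits so the 32-bit
	 * accumulator below starts with maximal headroom.
	 */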
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
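		/*
		 * If this fragment begins at an odd offset within the
		 * packet, the 16-bit words read below are byte-swapped
		 * relative to the packet's word boundaries; the partial
		 * sum then has to be rotated by 8 bits before folding.
		 */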
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
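			/*
			 * If either of the top two bits of partial is
			 * set, headroom is running low; fold partial
			 * into sum before another 32-byte round could
			 * overflow the 32-bit accumulator.
			 */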
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64-bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = initial_sum;

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
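		/*
		 * As in the 32-bit version: a fragment starting at an odd
		 * packet offset makes the word reads below byte-swapped
		 * relative to the packet, which is undone by rotating
		 * partial by 8 bits before folding.
		 */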
		needs_swap = started_on_odd;
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
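			/*
			 * If either of the top two bits of partial is
			 * set, fold it into sum before another 64-byte
			 * round could overflow the 64-bit accumulator.
			 */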
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif
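
/*
 * Below is a minimal userland smoke test, an illustrative sketch rather
 * than part of the kernel build.  It assumes that <sys/mbuf.h> exposes
 * struct mbuf and its m_next/m_len/m_data fields when compiled without
 * _KERNEL; the guard macro CKSUM_TEST_MAIN is hypothetical.  A 20-byte
 * buffer is split unevenly across two stack-allocated mbufs, which
 * exercises the odd-offset (needs_swap) path, and the result is checked
 * against a straightforward RFC 1071 reference loop over the contiguous
 * buffer.
 */
#if defined(CKSUM_TEST_MAIN) && !defined(_KERNEL)
#include <string.h>

static unsigned int
reference_cksum(const uint8_t *buf, int buflen)
{
	uint32_t acc = 0;
	uint16_t word;
	int i;

	/* Sum 16-bit words in host byte order, as cpu_in_cksum does. */
	for (i = 0; i + 1 < buflen; i += 2) {
		memcpy(&word, buf + i, sizeof(word));
		acc += word;
	}
	if (buflen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
		acc += buf[buflen - 1];
#else
		acc += buf[buflen - 1] << 8;
#endif
	}
	/* Fold the carries and take the one's complement. */
	while (acc >> 16)
		acc = (acc >> 16) + (acc & 0xffff);
	return ~acc & 0xffff;
}

int
main(void)
{
	uint8_t buf[20];
	struct mbuf m0, m1;
	unsigned int want;
	int i, got;

	for (i = 0; i < (int)sizeof(buf); i++)
		buf[i] = (uint8_t)(i * 7 + 3);

	/* Chain two mbufs: 9 bytes, then 11, forcing an odd split. */
	memset(&m0, 0, sizeof(m0));
	memset(&m1, 0, sizeof(m1));
	m0.m_data = (char *)buf;
	m0.m_len = 9;
	m0.m_next = &m1;
	m1.m_data = (char *)(buf + 9);
	m1.m_len = 11;

	got = cpu_in_cksum(&m0, sizeof(buf), 0, 0);
	want = reference_cksum(buf, sizeof(buf));
	printf("cpu_in_cksum: got 0x%04x, want 0x%04x (%s)\n",
	    (unsigned int)got, want, got == (int)want ? "ok" : "MISMATCH");
	return got == (int)want ? 0 : 1;
}
#endif /* CKSUM_TEST_MAIN && !_KERNEL */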