1 /* $NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $ */ 2 /*- 3 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 25 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 27 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
29 */ 30 31 #include <sys/cdefs.h> 32 __KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $"); 33 34 #include <sys/param.h> 35 #include <sys/endian.h> 36 #include <sys/mbuf.h> 37 #ifdef _KERNEL 38 #include <sys/systm.h> 39 #else 40 #include <assert.h> 41 #include <stdbool.h> 42 #include <stdio.h> 43 44 #define KASSERT(x) assert(x) 45 #endif 46 47 #include <machine/limits.h> 48 49 #include <netinet/in.h> 50 51 #ifndef _KERNEL 52 int cpu_in_cksum(struct mbuf*, int, int, uint32_t); 53 #endif 54 55 /* 56 * Checksum routine for Internet Protocol family headers (Portable Version). 57 * 58 * This routine is very heavily used in the network 59 * code and should be modified for each CPU to be as fast as possible. 60 * 61 * A discussion of different implementation techniques can be found in 62 * RFC 1071. 63 * 64 * The default implementation for 32bit architectures is using 65 * a 32bit accumulator and operating on 16bit operands. 66 * 67 * The default implementation for 64bit architectures is using 68 * a 64bit accumulator and operating on 32bit operands. 69 * 70 * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core 71 * of the inner loop. After each iteration of the inner loop, a partial 72 * reduction is done to avoid carry in long packets. 
*/

#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
/*
 * cpu_in_cksum:
 *
 *	Compute the ones-complement Internet checksum over 'len' bytes of
 *	the mbuf chain 'm', starting 'off' bytes into the chain.
 *	'initial_sum' is folded into the result (e.g. a precomputed
 *	pseudo-header sum).  Returns the 16bit complemented checksum,
 *	or -1 if the chain runs out of data before 'off' + 'len' bytes
 *	have been consumed.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* Pre-fold initial_sum to 16bit so later additions cannot carry out. */
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	/*
	 * Skip the initial 'off' bytes.  On exit via the goto, 'data'/'mlen'
	 * describe the remainder of the first mbuf that contains checksummed
	 * data.
	 */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		/* 'partial' accumulates this fragment; folded into 'sum' below. */
		partial = 0;
		if ((uintptr_t)data & 1) {
			/*
			 * Align on word boundary.  A fragment starting on an
			 * odd byte shifts the 16bit word grid; remember the
			 * parity so the fragment sum can be byte-rotated into
			 * place before folding.
			 */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
		/* Unrolled core loop: 16 aligned 16bit words per iteration. */
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
			/*
			 * Flush 'partial' into 'sum' before the top bits can
			 * overflow in long packets; with the two MSBs clear
			 * the byte rotate below cannot lose a carry.
			 */
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
			/* Trailing odd byte: next fragment's parity flips. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		/*
		 * Rotate the fragment sum left by 8 if this fragment's words
		 * were read off the packet's 16bit grid, then fold into 'sum'.
		 */
		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	/* Final fold to 16 bits and complement. */
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
/*
 * cpu_in_cksum:
 *
 *	Same contract as the 32bit version above: ones-complement Internet
 *	checksum over 'len' bytes of the mbuf chain 'm' starting at 'off',
 *	with 'initial_sum' folded in.  Returns the 16bit complemented
 *	checksum, or -1 if the chain runs out of data.  Uses a 64bit
 *	accumulator over aligned 32bit loads.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* 32bit initial_sum fits the 64bit accumulator without pre-folding. */
	sum = initial_sum;

	/* Skip the initial 'off' bytes (see 32bit version). */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/*
			 * Align on word boundary; track the packet word-grid
			 * parity as in the 32bit version.
			 */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
		/*
		 * Consume one 16bit word to reach 32bit alignment for the
		 * main loop.  If fewer than 2 bytes remain, only a possible
		 * trailing byte is left to handle.
		 */
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		/* Unrolled core loop: 16 aligned 32bit words per iteration. */
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
			/*
			 * Flush 'partial' before its top two bits can
			 * overflow in long packets (cf. 32bit version).
			 */
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
	trailing_bytes:
		if (mlen & 1) {
			/* Trailing odd byte: next fragment's parity flips. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		/*
		 * Rotate the fragment sum left by 8 if it was accumulated
		 * off the packet's word grid, then fold into 'sum'.
		 */
		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	/* Fold the four 16bit lanes down to 16 bits and complement. */
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif