1 /* $NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $ */ 2 /*- 3 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 25 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 27 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
29 */ 30 31 #include <sys/cdefs.h> 32 __KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $"); 33 34 #include <sys/param.h> 35 #include <sys/endian.h> 36 #include <sys/mbuf.h> 37 #ifdef _KERNEL 38 #include <sys/systm.h> 39 #else 40 #include <assert.h> 41 #include <stdbool.h> 42 #include <stdio.h> 43 44 #define KASSERT(x) assert(x) 45 #endif 46 47 #include <machine/limits.h> 48 49 #include <netinet/in.h> 50 51 #ifndef _KERNEL 52 int cpu_in_cksum(struct mbuf*, int, int, uint32_t); 53 #endif 54 55 /* 56 * Checksum routine for Internet Protocol family headers (Portable Version). 57 * 58 * This routine is very heavily used in the network 59 * code and should be modified for each CPU to be as fast as possible. 60 * 61 * A discussion of different implementation techniques can be found in 62 * RFC 1071. 63 * 64 * The default implementation for 32bit architectures is using 65 * a 32bit accumulator and operating on 16bit operands. 66 * 67 * The default implementation for 64bit architectures is using 68 * a 64bit accumulator and operating on 32bit operands. 69 * 70 * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core 71 * of the inner loop. After each iteration of the inner loop, a partial 72 * reduction is done to avoid carry in long packets. 
 */

#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
/*
 * cpu_in_cksum:
 *	Sum "len" bytes of the mbuf chain "m", starting "off" bytes into
 *	it, and fold in the seed "initial_sum" (e.g. a pseudo-header sum).
 *	Returns the complemented 16bit ones-complement checksum, or -1 if
 *	the chain runs out of data before "off" + "len" bytes are seen.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* Pre-fold the seed so the per-mbuf 16bit folds below cannot carry. */
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	/*
	 * Consume the initial "off" bytes.  Running off the end of the
	 * chain here is only an error if there is still data to sum.
	 */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			/* Offset lands inside this mbuf; sum its tail. */
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	/* Main loop: one mbuf per iteration. */
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			/*
			 * A leading odd byte shifts every following 16bit
			 * word by one byte relative to the checksum stream;
			 * track the parity so the partial sum can be
			 * byte-swapped back before folding into "sum".
			 */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		/* Parity at the start of this buffer decides the swap. */
		needs_swap = started_on_odd;
		/* Unrolled core: sixteen aligned 16bit loads per pass. */
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
			/*
			 * Fold "partial" into "sum" before the next 32 byte
			 * pass could overflow the 32bit accumulator; the
			 * swap (a rotate by 8) must happen while the value
			 * still fits without carry.
			 */
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
			/* Trailing odd byte flips the parity for the next mbuf. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	/* Reduce the (at most 17bit) accumulator to 16 bits and complement. */
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
/*
 * cpu_in_cksum:
 *	Same contract as the 32bit version above, but uses a 64bit
 *	accumulator over aligned 32bit loads for wider machines.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* The 64bit accumulator can take the 32bit seed as-is. */
	sum = initial_sum;

	/*
	 * Consume the initial "off" bytes.  Running off the end of the
	 * chain here is only an error if there is still data to sum.
	 */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			/* Offset lands inside this mbuf; sum its tail. */
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	/* Main loop: one mbuf per iteration. */
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			/*
			 * A leading odd byte shifts every following word by
			 * one byte relative to the checksum stream; track
			 * the parity so the partial sum can be byte-swapped
			 * back before folding into "sum".
			 */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		/* Parity at the start of this buffer decides the swap. */
		needs_swap = started_on_odd;
		/*
		 * Align to a 32bit boundary for the unrolled core; a
		 * buffer with fewer than 2 bytes left skips straight to
		 * the trailing-byte handling.
		 */
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		/* Unrolled core: sixteen aligned 32bit loads per pass. */
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
			/*
			 * Fold "partial" into "sum" before the next 64 byte
			 * pass could overflow the 64bit accumulator; the
			 * swap (a rotate by 8) must happen while the value
			 * still fits without carry.
			 */
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
			/* Trailing odd byte flips the parity for the next mbuf. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	/* Fold all four 16bit lanes of the accumulator and complement. */
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif