/*	$NetBSD: cpu_in_cksum.S,v 1.8 2013/12/22 16:29:42 matt Exp $	*/

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
 */

#include <machine/asm.h>
RCSID("$NetBSD: cpu_in_cksum.S,v 1.8 2013/12/22 16:29:42 matt Exp $")

#include "assym.h"

/*
 * int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
 *
 * Walks the mbuf chain starting at m, skips the first `off' bytes, sums
 * up to `len' bytes of data with arm_cksumdata() and returns the
 * complement of the sum folded to 16 bits.
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Returns:
 *	r0	complemented, 16-bit folded sum
 *
 * Function wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	r10	number of bytes handed to arm_cksumdata() so far
 *	r11	r10 ^ (data address) for the current mbuf; bit 0 decides
 *		whether the partial sum must be rotated by one byte
 *	ip	pointer to next mbuf
 *
 * Panics with "in_cksum: out of mbufs" if the chain ends before `off'
 * bytes have been skipped.
 */
/* LINTSTUB: Func: int cpu_in_cksum(struct mbuf *, int, int, uint32_t) */
ENTRY(cpu_in_cksum)
	push	{r4-r11,lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
	cmp	ip, #0x00
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00		/* nothing summed yet */
	b	.Lin_cksum_entry

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1			/* r1 = min(remaining len, mbuf len) */
#ifdef __thumb__
	bge	1f
	mov	r1, r9
#else
	movlt	r1, r9
#endif
1:	sub	r9, r9, r1
	eor	r11, r10, r0		/* bit 0: stream offset vs. address parity */
	add	r10, r10, r1
	adds	r2, r1, #0x00		/* r2 = 0 if there is nothing to sum */
#ifdef __thumb__
	it	ne
#endif
	blne	_ASM_LABEL(arm_cksumdata)
	tst	r11, #0x01		/* parities differ? */
#ifdef __thumb__
	it	ne
#endif
	movne	r2, r2, ror #8		/* yes: swap the byte lanes */
	adds	r8, r8, r2
	adc	r8, r8, #0x00		/* end-around carry */
	cmp	ip, #00
	bne	.Lin_cksum_loop

	/*
	 * Fold the 32-bit sum: adding (sum << 16) leaves low + high in the
	 * top halfword; a carry out is fed back in at bit 16.  Complement
	 * and move the result down to the low halfword.
	 */
#ifdef __thumb__
	mov	r0, r8
	lsls	r2, r0, #16
	adds	r0, r0, r2
	bcc	1f
	adds	r0, r0, #65536
1:	mvns	r0, r0
	lsrs	r0, r0, #16
#else
	adds	r8, r8, r8, lsl #16
	addcs	r8, r8, #65536
	mvn	r0, r8
	lsr	r0, r0, #16
#endif
	pop	{r4-r11, pc}

.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	_C_LABEL(panic)
.Lin_cksum_whoops_str:
	.asciz	"in_cksum: out of mbufs\n"
	.p2align	5
END(cpu_in_cksum)


/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 *
 * The only caller in this file never passes a zero length.
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(arm_cksumdata)
#ifdef __PROG26
	str	lr, [sp, #-4]!		/* for SVC26 mode */
#endif
#ifdef __XSCALE__
	pld	[r0]			/* Pre-fetch the start of the buffer */
#endif
	movs	r2, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03		/* r7 = offset of buffer within its word */
	beq	.Lcksumdata_wordaligned
	eors	r0, r0, r7		/* r0 is word aligned */
	ldr	r2, [r0], #0x04
#ifdef __thumb__
	movs	r4, r7
	lsls	r4, r4, #3
#else
	lsl	r4, r7, #3
#endif
	/* Zero the `offset' bytes of the word that precede the buffer. */
#if defined(__ARMEB__)
	lsls	r2, r2, r4
	lsrs	r2, r2, r4
#else
	lsrs	r2, r2, r4
	lsls	r2, r2, r4
#endif
	rsb	r7, r7, #0x04		/* r7 = 4 - offset: buffer bytes in word */
	subs	r1, r1, r7		/* Enough bytes left to make it? */
	bgt	.Lcksumdata_wordaligned
#ifdef __PROG26
	ldreq	pc, [sp], #4		/* done */
#else
	RETc(eq)			/* done */
#endif
	/*
	 * The buffer ends inside this first word.  r1 now holds
	 * len - (4 - offset), which is negative, so -r1 = 4 - (offset + len)
	 * is the number of bytes at the end of the word that lie beyond
	 * the buffer.  Zero those as well.
	 */
	rsb	r7, r1, #0x00		/* r7 = 4 - (offset + len) */
	lsls	r7, r7, #3		/* turn it into bits */
#if defined(__ARMEB__)
	lsrs	r2, r2, r7
	lsls	r2, r2, r7
#else
	lsls	r2, r2, r7
	lsrs	r2, r2, r7
#endif
#ifdef __PROG26
	ldr	pc, [sp], #4		/* done */
#else
	RET				/* done */
#endif

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef __XSCALE__
	cmp	r1, #0x04		/* Less than 4 bytes left? */
	blt	.Lcksumdata_endgame	/* Yup */

	/* Now quad-align, if necessary */
	ands	r7, r0, #0x04		/* r7 = 0 if already quad aligned */
	ldrne	r7, [r0], #0x04		/* else r7 = the odd word, summed below */
	subne	r1, r1, #0x04
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */

	/*
	 * Buffer is now quad aligned. Sum 64 bytes at a time.
	 * Note: First ldrd is hoisted above the loop, together with
	 * setting r6 to zero to avoid stalling for results in the
	 * loop. (r7 is live, from above).
	 */
	ldrd	r4, r5, [r0], #0x08
	mov	r6, #0x00
.Lcksumdata_bigloop:
	pld	[r0, #0x18]
	adds	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, r7, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, r5, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, r7, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, r5, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	pld	[r0, #0x18]
	ldrd	r6, r7, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, r5, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, r7, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, #0x00
	subs	r1, r1, #0x40
	ldrdge	r4, r5, [r0], #0x08
	bge	.Lcksumdata_bigloop

	adds	r2, r2, r6		/* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
	adcs	r2, r2, r7
	adcs	r2, r2, #0x00

#else	/* !__XSCALE__ */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Sum 64 bytes per iteration, carrying between every add. */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adcs	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif

	adds	r1, r1, #0x40		/* r1 = bytes left (0..63) */
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	RETc(eq)
#endif
	cmp	r1, #0x20

#ifdef __XSCALE__
	ldrdge	r4, r5, [r0], #0x08	/* Avoid stalling pld and result */
	blt	.Lcksumdata_less_than_32
	pld	[r0, #0x18]
	ldrd	r6, r7, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, r5, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, r7, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
	adcs	r2, r2, r7
#else
	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
#endif
	adcs	r2, r2, #0x00
	subs	r1, r1, #0x20
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	RETc(eq)
#endif

.Lcksumdata_less_than_32:
	/* There are less than 32 bytes left */
	and	r3, r1, #0x18		/* r3 = whole 8-byte groups left, in bytes */
	rsb	r4, r3, #0x18		/* r4 = bytes of 8-byte groups to skip */
	subs	r1, r1, r3		/* r1 = bytes left after those groups */
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	/*
	 * Computed jump into the unrolled code below.  r4 is now 0, 12, 24
	 * or 36: each 8-byte group below is three 4-byte ARM instructions.
	 * In ARM state pc reads as this instruction + 8, i.e. the address
	 * of the nop, which pads the first group so the offsets line up.
	 * When r4 is zero the add is not executed and all three groups run.
	 *
	 * NOTE(review): the offsets assume 4-byte instruction encodings and
	 * the ARM-state pc bias; whether this is right when assembled for
	 * Thumb has not been verified -- confirm before relying on it.
	 */
#ifdef __thumb__
	it	ne
#endif
	addne	pc, pc, r4

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adcs	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	subs	r1, r1, #0x04
	adds	r2, r2, r4
	adcs	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	RETc(eq)
#endif

	/*
	 * Deal with 1 to 3 remaining bytes.  r0 is word aligned here, so
	 * the whole word is fetched and the bytes past the end of the
	 * buffer are zeroed before summing.
	 */
.Lcksumdata_endgame:
	ldr	r3, [r0]		/* Fetch last word */
	rsb	r1, r1, #4		/* get discard amount */
	lsl	r1, r1, #3		/* turn it into bits */
#ifdef __ARMEB__
	lsr	r3, r3, r1		/* discard least significant bits */
	lsl	r3, r3, r1		/* shift back filling with zeros */
#else
	lsl	r3, r3, r1		/* discard most significant bits */
	lsr	r3, r3, r1		/* shift back filling with zeros */
#endif
	adds	r2, r2, r3
	adcs	r2, r2, #0x00
#ifdef __PROG26
	ldr	pc, [sp], #4
#else
	RET
#endif
ASEND(arm_cksumdata)