/*	$NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $	*/

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
 *
 * Syntax: GNU as, 32-bit ARM state, classic (pre-UAL) conditional
 * mnemonics such as "ldrgeb" and "ldrged".  The file is run through the
 * C preprocessor; M_LEN, M_DATA and M_NEXT are expected to come from
 * "assym.h".
 */

#include <machine/asm.h>
RCSID("$NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $")

#include "assym.h"

/*
 * int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
 *
 * Skips the first `off' bytes of the mbuf chain, then sums up to `len'
 * bytes starting there, one mbuf at a time, on top of `initial_sum'.
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Returns:
 *	r0	the 32-bit accumulator folded to 16 bits and inverted
 *		(result is always in the range 0x0000-0xffff)
 *
 * Function wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	r10	number of bytes summed so far
 *	r11	(bytes summed so far) ^ (data pointer); only bit 0 is used,
 *		to decide whether this mbuf's partial sum must be byte-swapped
 *	ip	pointer to next mbuf
 *
 * r4-r11 and lr are saved on entry and restored on exit.
 * Panics (via panic()) if the chain ends before `off' bytes were skipped.
 */
/* LINTSTUB: Func: int cpu_in_cksum(struct mbuf *, int, int, uint32_t) */
ENTRY(cpu_in_cksum)
	stmfd	sp!, {r4-r11,lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

	/*
	 * Phase 1: walk the chain until the mbuf containing byte `off'
	 * is found.  r0/r1 hold that mbuf's data pointer and length,
	 * ip already points at its successor.
	 */
.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
	cmp	ip, #0x00
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops	/* chain exhausted while skipping */

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00		/* no bytes summed yet */
	b	.Lin_cksum_entry

	/*
	 * Phase 2: sum min(remaining len, mbuf length) bytes of each mbuf
	 * until the end of the chain.
	 */
.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1
	movlt	r1, r9			/* clamp to the remaining length */
	sub	r9, r9, r1
	eor	r11, r10, r0		/* bit 0: stream offset parity vs. address parity */
	add	r10, r10, r1
	adds	r2, r1, #0x00		/* r2 = r1; Z set if nothing to sum here */
	blne	_ASM_LABEL(L_cksumdata)	/* r2 = partial sum (stays 0 if skipped) */
	tst	r11, #0x01
	movne	r2, r2, ror #8		/* parities differ: byte-swap partial sum */
	adds	r8, r8, r2
	adc	r8, r8, #0x00		/* end-around carry */
	cmp	ip, #00
	bne	.Lin_cksum_loop

	/* Fold the 32-bit accumulator to 16 bits and complement it. */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
	eor	r0, r0, r1
	ldmfd	sp!, {r4-r11, pc}

	/*
	 * The chain ran out before the requested offset was reached.
	 * NOTE(review): there is no code after the call, so this relies
	 * on panic() not returning -- the string data follows directly.
	 */
.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	_C_LABEL(panic)
.Lin_cksum_whoops_str:
	.asciz	"in_cksum: out of mbufs\n"
	.align	5


/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 *
 * The only caller in this file invokes this routine with r1 != 0.
 *
 * The long runs of "adcs" below form a single carry chain: each addition
 * feeds its carry-out into the next one, and the chain is closed with
 * "adc r2, r2, #0".  Instructions placed between them (ldrd/ldmia/pld)
 * do not alter the flags, so instruction order matters here.
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef __PROG26
	str	lr, [sp, #-4]!		/* for SVC26 mode */
#endif
#ifdef __XSCALE__
	pld	[r0]			/* Pre-fetch the start of the buffer */
#endif
	mov	r2, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes needed to reach alignment (1-3) */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02		/* flags select how many bytes to fetch... */
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrgeb	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrgtb	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/*
	 * Combine the three bytes depending on endianness and alignment
	 * (EQ: exactly two bytes fetched, i.e. the start address was even;
	 * NE: one or three bytes fetched, i.e. the start address was odd).
	 */
#ifdef __ARMEB__
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
#ifdef __PROG26
	ldreq	pc, [sp], #4		/* All done? */
#else
	moveq	pc, lr			/* All done? */
#endif

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef __XSCALE__
	cmp	r1, #0x04		/* Less than 4 bytes left? */
	blt	.Lcksumdata_endgame	/* Yup */

	/*
	 * Now quad-align, if necessary.
	 * r7 ends up holding either the extra word consumed here or zero
	 * (the result of the "ands"); it is summed later.
	 */
	ands	r7, r0, #0x04
	ldrne	r7, [r0], #0x04
	subne	r1, r1, #0x04
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */

	/*
	 * Buffer is now quad aligned. Sum 64 bytes at a time.
	 * Note: First ldrd is hoisted above the loop, together with
	 * setting r6 to zero to avoid stalling for results in the
	 * loop. (r7 is live, from above).
	 */
	ldrd	r4, [r0], #0x08
	mov	r6, #0x00
.Lcksumdata_bigloop:
	pld	[r0, #0x18]
	adds	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	ldrged	r4, [r0], #0x08		/* pre-load for the next iteration */
	bge	.Lcksumdata_bigloop

	adds	r2, r2, r6		/* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
	adcs	r2, r2, r7
	adc	r2, r2, #0x00

#else	/* !__XSCALE__ */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Sum 64 bytes per iteration, four words per ldmia. */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif

	adds	r1, r1, #0x40		/* undo the bias; r1 = bytes left (< 64) */
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	moveq	pc, lr
#endif
	cmp	r1, #0x20

	/* At least 32 bytes left: sum one 32-byte chunk. */
#ifdef __XSCALE__
	ldrged	r4, [r0], #0x08		/* Avoid stalling pld and result */
	blt	.Lcksumdata_less_than_32
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
	adcs	r2, r2, r7
#else
	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
#endif
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	moveq	pc, lr
#endif

.Lcksumdata_less_than_32:
	/*
	 * There are less than 32 bytes left.
	 *
	 * Computed branch into the unrolled 8-byte groups below:
	 *	r3 = number of bytes to sum in whole 8-byte groups (0/8/16/24)
	 *	r4 = (0x18 - r3) * 1.5 = 12 bytes of code per group to skip
	 * Reading pc in ARM state yields the address of the current
	 * instruction + 8, i.e. the "nop" below, so the nop pads the first
	 * group to make every jump target exactly 12 bytes apart from there.
	 * DO NOT add, remove or reorder instructions between the "addne"
	 * and the "Less than 8 bytes remaining" label without redoing
	 * this arithmetic.
	 */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04		/* keep r1 biased by -4 for the fixup below */
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04		/* remove the bias; r1 = bytes left (0-3) */
#ifdef __PROG26
	ldreq	pc, [sp], #4
#else
	moveq	pc, lr
#endif

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrgeb	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrgtb	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01		/* EQ: even start address, NE: odd */
#ifdef __ARMEB__
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00
#ifdef __PROG26
	ldr	pc, [sp], #4
#else
	mov	pc, lr
#endif