/*-
 * Copyright (c) 2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.4 2021/10/02 20:52:09 skrll Exp $")

/*
 * uint32_t
 * cpu_in_cksum_neon(const void *dptr, size_t dlen)
 *
 *	r0 = dptr
 *	r1 = dlen
 */
ENTRY(cpu_in_cksum_neon)
	mov	ip, r0			/* leave r0 as temp */
	add	r3, r1, ip		/* get end pointer */
	and	r1, ip, #7		/* get start offset (leading bytes) */
	and	r2, r3, #7		/* get end offset (trailing bytes) */
	bic	ip, ip, #7		/* start on a dword boundary */
	add	r3, r3, #7		/* round up to a dword boundary */
	bic	r3, r3, #7		/* end on a dword boundary */
	veor	q2, q2, q2		/* clear accumulator */
	vmvn.u64 q1, q2			/* create leading/trailing masks */
	/*
	 * Normally the lower-addressed dword is in d6, but in this case we
	 * want to reverse that since we might only have a single dword and
	 * the final fold will want the dword to trim in d7.  So put the
	 * first dword in d7 until we know we are going to read more than
	 * one.
	 */
	veor	d6, d6, d6		/* clear second dword */
	vld1.64 {d7}, [ip:64]!		/* load first dword */
	orrs	r0, r1, r2		/* do we have any offsets? */
	beq	.Lpre_main_loop		/* no, proceed to main loop. */
	mov	r1, r1, lsl #3		/* leading bytes -> bits */
	movs	r2, r2, lsl #3		/* trailing bytes -> bits */
#ifdef __ARMEL__
	subne	r2, r2, #64		/* trim trailing MSBs */
#else
	rsb	r1, r1, #0		/* trim leading MSBs */
	rsbne	r2, r2, #64		/* trim trailing LSBs */
#endif
	vmov	d0, r1, r2		/* move shifts */
	vmovl.u32 q0, d0		/* 2 U32 -> 2 U64 */
	vshl.u64 q1, q1, q0		/* apply shifts to masks */
	vand.u32 d7, d7, d2		/* apply leading mask to 1st dword */
	tst	r1, #8			/* was the starting address odd? */
	beq	.Lpre_main_loop		/* no, go to pre_main_loop */
	veor	d2, d2, d2		/* clear d2 (indicate odd addr) */

.Lpre_main_loop:
	cmp	ip, r3			/* do we just have a single dword? */
	beq	.Lfinish_up		/* yes, let's finish up! */
	vmov	d6, d7			/* move 1st dword to loaddr reg */
	vld1.64 {d7}, [ip:64]!		/* read rest of initial qword */
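
	/*
	 * Main loop strategy: data is consumed in 32-byte blocks.  Each
	 * block below is exactly 10 instructions (40 bytes of code) for
	 * 32 bytes of data, a code-to-data ratio of 1.25.  When fewer
	 * than 128 bytes of whole 32-byte chunks remain, "add pc, pc, r0"
	 * (with r0 = bytes-to-skip scaled by 1.25; in ARM state pc reads
	 * as the current instruction's address + 8, i.e. the instruction
	 * after the nop below) jumps into the middle of the unrolled
	 * loop, Duff's-device style.  For example, with 96 such bytes
	 * left, r0 = (128 - 96) * 1.25 = 40, skipping the first block.
	 * Note that d6/d7 always hold a loaded-but-not-yet-accumulated
	 * qword on entry; each block accumulates the pending qword while
	 * loading the next one.
	 */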

.Lmain_loop:
	subs	r1, r3, ip		/* how much is left to do? */
	beq	.Lfinish_up		/* = 0? we are done. */

	bics	r0, r1, #31		/* we deal with octawords only */
	beq	.Lloop_end		/* no octawords? exit loop */
	rsbs	r0, r0, #128		/* subtract from 128 */
	ble	.Lloop128		/* <= 0? do 128 at a time. */
	add	r0, r0, r0, lsr #2	/* multiply by 1.25 */
	add	pc, pc, r0		/* and jump! */
	nop

.Lloop128:
	vld1.64 {d8-d9}, [ip:64]!	/* 128 left */
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d6-d7}, [ip:64]!	/* load next qword */
	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */

	vld1.64 {d8-d9}, [ip:64]!	/* 96 left */
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d6-d7}, [ip:64]!	/* load next qword */
	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */

	vld1.64 {d8-d9}, [ip:64]!	/* 64 left */
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d6-d7}, [ip:64]!	/* load next qword */
	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */

	vld1.64 {d8-d9}, [ip:64]!	/* 32 left */
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d6-d7}, [ip:64]!	/* load next qword */
	vmovl.u16 q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vmovl.u16 q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */

	b	.Lmain_loop

.Lloop_end:
	/*
	 * We have one to three more dwords to process.
	 */
	rsb	r0, r1, #24
	add	r0, r0, r0, lsr #1	/* multiply by 1.5 (12 bytes/dword) */
	add	pc, pc, r0		/* and jump! */
	nop
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d6}, [ip:64]!		/* load next dword */
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d6}, [ip:64]!		/* load next dword */
	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vld1.64 {d7}, [ip:64]!		/* load last dword */

.Lfinish_up:
	/*
	 * Apply remaining data in d6 and d7.
	 */
	vmovl.u16 q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */
	vand	d7, d7, d3		/* apply trailing mask */
	vmovl.u16 q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32 q2, q2, q0		/* add 4 U32 to accumulator */

	/*
	 * We now have four 32-bit sums in q2 (each 20 bits or less).
	 * Now reduce them to a single 32-bit sum.
	 */
	vadd.u32 d4, d4, d5		/* 4 I32 -> 2 I32 */
	vmov	r2, s4			/* get flag for odd start */
	teq	r2, #0			/* was start addr even? */
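	/*
	 * d2 was cleared above when dptr was odd, so r2 == 0 here means
	 * the data was summed starting from an odd byte, i.e. with the
	 * bytes of every 16-bit word swapped (e.g. bytes 0x12 0x34
	 * contributed 0x3412 on a little-endian load).  The Internet
	 * checksum is byte-order independent (RFC 1071), so this is
	 * repaired by byte-swapping the halfwords of the partial sums
	 * (rev16) before the final fold.
	 */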
	vmov	r0, r1, d4		/* extract two I32 */
	rev16eq	r0, r0			/* byte swap if start was odd */
	rev16eq	r1, r1			/* byte swap if start was odd */
	adds	ip, r0, r1		/* add them producing carry */
#include "arm/arm/cpu_in_cksum_fold.S"
END(cpu_in_cksum_neon)
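
/*
 * Illustrative C model of the strategy above (a sketch, not the shipped
 * implementation): 16-bit chunks are widened and summed into a wider
 * accumulator, and the result is folded with end-around carry.  The fold
 * step is an assumption about what the included cpu_in_cksum_fold.S
 * performs; the alignment masking and odd-address byte swap handled
 * above are omitted, and "cksum_model" is a hypothetical name used for
 * illustration only.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	cksum_model(const uint16_t *p, size_t nwords)
 *	{
 *		uint64_t sum = 0;
 *
 *		while (nwords-- > 0)
 *			sum += *p++;	// widen U16, like vmovl.u16 + vadd.u32
 *		while (sum >> 16)	// fold with end-around carry
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (uint32_t)sum;
 *	}
 */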