/*-
 * Copyright (c) 2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.4 2021/10/02 20:52:09 skrll Exp $")

/*
 * uint32_t
 * cpu_in_cksum_neon(const void *dptr, size_t dlen)
 *
 *	r0 = dptr
 *	r1 = dlen
 */
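/*
 * Illustrative C sketch of the value this routine is intended to produce
 * (added commentary, not part of the build; the helper name is made up):
 * the one's-complement partial sum over dlen bytes at dptr, accumulated
 * from host-order 16-bit words.  Little-endian flavour shown; the exact
 * return convention is whatever cpu_in_cksum_fold.S (included at the end)
 * hands back to the caller.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	uint32_t
 *	cksum_sketch(const uint8_t *p, size_t len)
 *	{
 *		uint32_t sum = 0;
 *
 *		while (len > 1) {
 *			sum += p[0] | ((uint32_t)p[1] << 8);	// LE 16-bit word
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len != 0)
 *			sum += p[0];		// odd trailing byte
 *		while (sum >> 16)		// fold end-around carries
 *			sum = (sum >> 16) + (sum & 0xffff);
 *		return sum;
 *	}
 */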
ENTRY(cpu_in_cksum_neon)
	mov		ip, r0		/* leave r0 as temp */
	add		r3, r1, ip	/* get end pointer */
	and		r1, ip, #7	/* get start offset (leading bytes) */
	and		r2, r3, #7	/* get end offset (trailing bytes) */
	bic		ip, ip, #7	/* start on a dword boundary */
	add		r3, r3, #7	/* round up to a dword boundary */
	bic		r3, r3, #7	/* end on a dword boundary */
	veor		q2, q2, q2	/* clear accumulator */
	vmvn.u64	q1, q2		/* create leading/trailing masks */
	/*
	 * Normally the lower-addressed dword would go in d6, but here we
	 * reverse that: we might only have a single dword, and the final
	 * fold wants the dword to be trimmed in d7, so keep the first
	 * dword in d7 until we know we are going to read more than one.
	 */
	veor		d6, d6, d6	/* clear second dword */
	vld1.64		{d7}, [ip:64]!	/* load first dword */
	orrs		r0, r1, r2	/* do we have any offsets */
	beq		.Lpre_main_loop	/*   no, proceed to main loop. */
	mov		r1, r1, lsl #3	/* leading bytes -> bits */
	movs		r2, r2, lsl #3	/* trailing bytes -> bits */
#ifdef __ARMEL__
	subne		r2, r2, #64	/* trim trailing MSBs */
#else
	rsb		r1, r1, #0	/* trim leading MSBs */
	rsbne		r2, r2, #64	/* trim trailing LSBs */
#endif
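	/*
	 * Added commentary: vshl.u64 below takes a per-lane signed shift
	 * count, so a positive count shifts left and a negative count
	 * shifts right.  The sign fix-ups above pick the direction for the
	 * current endianness so that the shifted all-ones masks keep only
	 * the bytes of the first/last dword that lie inside
	 * [dptr, dptr + dlen); the aligned dwords can then be loaded whole
	 * and the out-of-range bytes contribute zero to the sum.
	 */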
	vmov		d0, r1, r2	/* move shifts */
	vmovl.u32	q0, d0		/* 2 U32 -> 2 U64 */
	vshl.u64	q1, q1, q0	/* apply shifts to masks */
	vand.u32	d7, d7, d2	/* apply leading mask to 1st dword */
	tst		r1, #8		/* was the starting address odd? */
	beq		.Lpre_main_loop	/*   no, go to pre_main_loop */
	veor		d2, d2, d2	/* clear d2 (indicate odd addr) */

.Lpre_main_loop:
	cmp		ip, r3		/* do we just have a single dword? */
	beq		.Lfinish_up	/*   yes, let's finish up! */
	vmov		d6, d7		/* move 1st dword to loaddr reg */
	vld1.64		{d7}, [ip:64]!	/* read rest of initial qword */

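	/*
	 * Loop invariant (added commentary): d6/d7 hold the most recently
	 * loaded, not yet accumulated, qword and ip points just past it;
	 * r3 - ip is the number of bytes still to be loaded.
	 */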
.Lmain_loop:
	subs		r1, r3, ip	/* how much left to do? */
	beq		.Lfinish_up	/*   = 0? we are done. */

	bics		r0, r1, #31	/* we deal with octawords only */
	beq		.Lloop_end	/*   no octawords? exit loop */
	rsbs		r0, r0, #128	/* subtract from 128 */
	ble		.Lloop128	/*   <= 0? do 128 at a time. */
	add		r0, r0, r0, lsr #2 /* multiply by 1.25 */
	add		pc, pc, r0	/* and jump! */
	nop
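	/*
	 * Added commentary on the computed branch above: assuming A32
	 * encodings, each of the four 32-byte blocks below is ten 4-byte
	 * instructions, i.e. 40 bytes of code per 32 bytes of data, hence
	 * the scaling by 1.25.  Reading pc gives the address of the add
	 * itself plus 8, which (because of the nop) is .Lloop128, so with
	 * 32/64/96 octaword bytes pending the jump enters at the block
	 * commented "32/64/96 left"; with 128 or more the ble above has
	 * already gone to .Lloop128.
	 */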

.Lloop128:
	vld1.64		{d8-d9}, [ip:64]!	/* 128 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 96 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 64 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 32 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	b		.Lmain_loop

.Lloop_end:
	/*
	 * We have one to three more dwords to process.
	 */
	rsb		r0, r1, #24	/* r0 = 24 - bytes left (0, 8, or 16) */
	add		r0, r0, r0, lsr #1 /* multiply by 1.5 */
	add		pc, pc, r0	/* and jump! */
	nop
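	/*
	 * Added commentary on the computed branch above: each remaining
	 * dword is consumed by one three-instruction (12-byte) group below,
	 * i.e. 12 bytes of code per 8 bytes of data, hence the scaling by
	 * 1.5.  With 24/16/8 bytes left the jump enters at the first/
	 * second/third group; whatever is still pending in d6/d7 afterwards
	 * is added at .Lfinish_up, where d7 also gets the trailing mask.
	 */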
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6}, [ip:64]!
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6}, [ip:64]!
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d7}, [ip:64]!

.Lfinish_up:
	/*
	 * Apply remaining data in d6 and d7
	 */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vand		d7, d7, d3	/* apply trailing mask */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	/*
	 * We now have 4 32-bit sums in q2 (each is 20 bits or less).
	 * Reduce them to a single 32-bit sum.
	 */
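	/*
	 * Added commentary: if dptr was odd, the aligned loads paired each
	 * byte one position off, which yields the byte-swapped checksum.
	 * Swapping the bytes of a 16-bit one's-complement sum is the same
	 * as multiplying by 256 mod 0xffff, so the rev16s below may be
	 * applied to the still-unfolded 32-bit sums (they stay well below
	 * 2^24) and the final fold then gives the expected result.
	 */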
	vadd.u32	d4, d4, d5	/* 4 I32 -> 2 I32 */
	vmov		r2, s4		/* get flag for odd start */
	teq		r2, #0		/* eq => start addr was odd */
	vmov		r0, r1, d4	/* extract two I32 */
	rev16eq		r0, r0		/* byte swap if start was odd */
	rev16eq		r1, r1		/* byte swap if start was odd */
	adds		ip, r0, r1	/* add them producing carry */
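	/*
	 * The included fold code is expected to reduce the 32-bit sum in ip
	 * (plus the carry from the adds above) to the value returned to the
	 * caller; see cpu_in_cksum_fold.S.
	 */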
#include "arm/arm/cpu_in_cksum_fold.S"
END(cpu_in_cksum_neon)