xref: /isa-l/raid/aarch64/xor_gen_neon.S (revision 5a00eaec3325e6bc681424fe66b4680400bca540)
1########################################################################
2#  Copyright(c) 2019 Arm Corporation All rights reserved.
3#
4#  Redistribution and use in source and binary forms, with or without
5#  modification, are permitted provided that the following conditions
6#  are met:
7#    * Redistributions of source code must retain the above copyright
8#      notice, this list of conditions and the following disclaimer.
9#    * Redistributions in binary form must reproduce the above copyright
10#      notice, this list of conditions and the following disclaimer in
11#      the documentation and/or other materials provided with the
12#      distribution.
13#    * Neither the name of Arm Corporation nor the names of its
14#      contributors may be used to endorse or promote products derived
15#      from this software without specific prior written permission.
16#
17#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28#########################################################################
29
30#include "../include/aarch64_label.h"
31
.text

.global cdecl(xor_gen_neon)
#ifndef __APPLE__
.type xor_gen_neon, %function
#endif

/*
 * C prototype:
 *     int xor_gen_neon(int vects, int len, void **src)
 *
 * Symbolic register names used by the routine, grouped by the
 * physical register each one stands for.
 */

/* x0: first argument on entry, return value on exit */
w_vects		.req	w0	/* vects, MUST be >= 2 */
x_vects		.req	x0
w_ret		.req	w0

/* x1: second argument; also used as a byte-sized input temporary */
w_len		.req	w1
x_len		.req	x1
w_in		.req	w1	/* shares the register with w_len */

/* x2: third argument, the pointer array */
x_src		.req	x2

/* x3, x4: cursor into the first source buffer and its end */
x_src0		.req	x3
x_src0_end	.req	x4

/* x5: one scratch register under three names (len256, len16, xor) */
w_len256	.req	w5
x_len256	.req	x5
w_len16		.req	w5
x_len16		.req	x5
w_xor		.req	w5

/* x6: byte offset ("column") applied to the other source buffers */
w_col		.req	w6
x_col		.req	x6

/* x7, x9: walker over the pointer array and the pointer it fetched */
x_src_ptr	.req	x7
x_srcn		.req	x9

/* x10, x11: destination cursor and the array slot it was read from */
x_dst		.req	x10
x_dst_ptr	.req	x11

/*
 * Vector registers are used without aliases:
 *   v0  ~ v15: running XOR result
 *   v16 ~ v31: next 256 bytes read from a source buffer
 */
68
69/*
70 *                 +----------+            +------------------+
71 *         src --> |  src[0]  | - src0 ->  |      buffer      | src0_end
72 *         --------+----------+            +------------------+
73 *           .     |  ......  |
74 *           .     +----------+            +------------------+
75 *     src_ptr ~~> |  src[n]  | - srcn ~>  |      buffer      |
76 *           .     +----------+            +------------------+
77 *           .     |  ......  |
78 *           .     +----------+
79 *           .     | src[v-2] |
80 *         --------+----------+            +------------------+
81 *     dst_ptr --> | src[v-1] | -- dst --> |      buffer      |
82 *                 +----------+            +------------------+
83 */
84
/*
 * int xor_gen_neon(int vects, int len, void **src)
 *
 * XOR parity generation.  For every byte offset i in [0, len):
 *     src[vects-1][i] = src[0][i] ^ src[1][i] ^ ... ^ src[vects-2][i]
 *
 * In:    w0 = vects  number of entries in src[], destination included;
 *                    must be >= 2 (not checked)
 *        w1 = len    number of bytes in each buffer
 *        x2 = src    array of buffer pointers; the last entry is the
 *                    destination
 * Out:   w0 = 0 on every path
 * Clobb: x1, x3-x7, x9-x11, v0-v7, v16-v31, NZCV
 *        d8-d15 are saved/restored around the 256-byte loop
 * Stack: 64 bytes, only while the 256-byte loop runs
 *
 * The length is consumed in three passes: 256-byte blocks, then
 * 16-byte blocks, then single bytes.  w_col tracks the byte offset
 * already processed and is used to index src[1..vects-2]; src[0] and
 * the destination are walked with their own pointers.
 */
cdecl(xor_gen_neon):
	/*
	 * vects and len are 32-bit ints: only w0/w1 are defined on entry,
	 * the upper halves of x0/x1 are unspecified by the procedure call
	 * standard, so widen from the W registers explicitly.
	 */
	add	x_dst_ptr, x_src, w_vects, sxtw #3	/* &src[vects] */
	ldr	x_dst, [x_dst_ptr, #-8]!		/* dst_ptr = &src[vects-1]; dst = *dst_ptr */
	ldr	x_src0, [x_src]				/* src0 = src[0] */
	add	x_src0_end, x_src0, w_len, sxtw		/* src0_end = src0 + len */

	/* w_vects now counts the sources other than src[0] */
	sub	w_vects, w_vects, #2
	mov	w_col, #0

.Lloop256_init:
	/* len256 = len - len%256; len %= 256 */
	mov	w_len256, w_len
	and	w_len, w_len, #0xFF
	sub	w_len256, w_len256, w_len

	/* less than 256 bytes? */
	cbz	w_len256, .Lloop16_init

	/* save d8 ~ d15 to stack (the loop below overwrites v8 ~ v15) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias the end so "src0 <= src0_end" means a full block remains */
	sub	x_src0_end, x_src0_end, #256

	/* batch process (vects-1)*256 bytes */
.Lloop256:
	/* v0 ~ v15 = next 256 bytes of src[0] */
	ldr	q0,  [x_src0, #16*0]
	ldr	q1,  [x_src0, #16*1]
	ldr	q2,  [x_src0, #16*2]
	ldr	q3,  [x_src0, #16*3]
	ldr	q4,  [x_src0, #16*4]
	ldr	q5,  [x_src0, #16*5]
	ldr	q6,  [x_src0, #16*6]
	ldr	q7,  [x_src0, #16*7]
	ldr	q8,  [x_src0, #16*8]
	ldr	q9,  [x_src0, #16*9]
	ldr	q10, [x_src0, #16*10]
	ldr	q11, [x_src0, #16*11]
	ldr	q12, [x_src0, #16*12]
	ldr	q13, [x_src0, #16*13]
	ldr	q14, [x_src0, #16*14]
	ldr	q15, [x_src0, #16*15]
	add	x_src0, x_src0, #256

	/* src[0] is the only source: store it unchanged */
	cbz	w_vects, .Lloop256_vects_end

	add	x_src_ptr, x_src, #8		/* start at &src[1] */
.Lloop256_vects:
	ldr	x_srcn, [x_src_ptr], #8		/* srcn = *src_ptr++ */
	add	x_srcn, x_srcn, x_col
	/* flags are consumed by the b.ne below; ldr/eor leave them alone */
	cmp	x_src_ptr, x_dst_ptr

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]
	ldr	q24, [x_srcn, #16*8]
	ldr	q25, [x_srcn, #16*9]
	ldr	q26, [x_srcn, #16*10]
	ldr	q27, [x_srcn, #16*11]
	ldr	q28, [x_srcn, #16*12]
	ldr	q29, [x_srcn, #16*13]
	ldr	q30, [x_srcn, #16*14]
	ldr	q31, [x_srcn, #16*15]

	eor	v0.16b,  v0.16b,  v16.16b
	eor	v1.16b,  v1.16b,  v17.16b
	eor	v2.16b,  v2.16b,  v18.16b
	eor	v3.16b,  v3.16b,  v19.16b
	eor	v4.16b,  v4.16b,  v20.16b
	eor	v5.16b,  v5.16b,  v21.16b
	eor	v6.16b,  v6.16b,  v22.16b
	eor	v7.16b,  v7.16b,  v23.16b
	eor	v8.16b,  v8.16b,  v24.16b
	eor	v9.16b,  v9.16b,  v25.16b
	eor	v10.16b, v10.16b, v26.16b
	eor	v11.16b, v11.16b, v27.16b
	eor	v12.16b, v12.16b, v28.16b
	eor	v13.16b, v13.16b, v29.16b
	eor	v14.16b, v14.16b, v30.16b
	eor	v15.16b, v15.16b, v31.16b

	/* until src_ptr reaches the destination slot */
	b.ne	.Lloop256_vects

.Lloop256_vects_end:
	str	q0,  [x_dst, #16*0]
	str	q1,  [x_dst, #16*1]
	str	q2,  [x_dst, #16*2]
	str	q3,  [x_dst, #16*3]
	str	q4,  [x_dst, #16*4]
	str	q5,  [x_dst, #16*5]
	str	q6,  [x_dst, #16*6]
	str	q7,  [x_dst, #16*7]
	str	q8,  [x_dst, #16*8]
	str	q9,  [x_dst, #16*9]
	str	q10, [x_dst, #16*10]
	str	q11, [x_dst, #16*11]
	str	q12, [x_dst, #16*12]
	str	q13, [x_dst, #16*13]
	str	q14, [x_dst, #16*14]
	str	q15, [x_dst, #16*15]

	/* add does not set flags, so the cmp result survives to b.ls */
	cmp	x_src0, x_src0_end
	add	x_dst, x_dst, #256
	add	w_col, w_col, #256
	b.ls	.Lloop256

.Lloop256_end:
	/* restore d8 ~ d15 */
	ldp	d8, d9, [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the 256-byte bias on the end pointer */
	add	x_src0_end, x_src0_end, #256

.Lloop16_init:
	/* len16 = len - len%16; len %= 16 */
	mov	w_len16, w_len
	and	w_len, w_len, #0xF
	sub	w_len16, w_len16, w_len

	/* less than 16 bytes? */
	cbz	w_len16, .Lloop1_init

	/* bias the end so "src0 <= src0_end" means a full block remains */
	sub	x_src0_end, x_src0_end, #16

	/* batch process (vects-1)*16 bytes */
.Lloop16:
	ldr	q0, [x_src0], #16
	cbz	w_vects, .Lloop16_vects_end

	add	x_src_ptr, x_src, #8		/* start at &src[1] */
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #8		/* srcn = *src_ptr++ */
	cmp	x_src_ptr, x_dst_ptr
	ldr	q1, [x_srcn, x_col]
	eor	v0.16b, v0.16b, v1.16b
	b.ne	.Lloop16_vects

.Lloop16_vects_end:
	cmp	x_src0, x_src0_end
	str	q0, [x_dst], #16
	add	w_col, w_col, #16
	b.ls	.Lloop16

.Lloop16_end:
	/* undo the 16-byte bias on the end pointer */
	add	x_src0_end, x_src0_end, #16

.Lloop1_init:
	/* no tail bytes left? */
	cbz	w_len, .Lreturn_pass

	/*
	 * batch process (vects-1)*1 bytes
	 * w_len is dead from here on: its register is reused as w_in and
	 * the loop terminates on src0 == src0_end instead.
	 */
.Lloop1:
	ldrb	w_xor, [x_src0], #1
	cbz	w_vects, .Lloop1_vects_end

	add	x_src_ptr, x_src, #8		/* start at &src[1] */
.Lloop1_vects:
	ldr	x_srcn, [x_src_ptr], #8		/* srcn = *src_ptr++ */
	cmp	x_src_ptr, x_dst_ptr
	ldrb	w_in, [x_srcn, x_col]
	eor	w_xor, w_xor, w_in
	b.ne	.Lloop1_vects

.Lloop1_vects_end:
	cmp	x_src0, x_src0_end
	strb	w_xor, [x_dst], #1
	add	w_col, w_col, #1
	b.ne	.Lloop1

.Lreturn_pass:
	mov	w_ret, #0
	ret
269