#  xref: /isa-l/raid/aarch64/pq_check_neon.S (revision 5be1ba2215613379677fb85cccb327627bdac6a0)
########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

.text

.global pq_check_neon
.type pq_check_neon, %function

/*
 * int pq_check_neon(int vects, int len, void **src)
 *
 * Recomputes the P and Q parity of the source buffers src[0 .. vects-3]
 * and compares the result with the stored P (src[vects-2]) and
 * Q (src[vects-1]).
 *
 *   P : XOR of all source buffers.
 *   Q : folded from src[vects-3] down to src[0] as
 *           q = mul2(q) ^ src[j]
 *       where mul2 shifts every byte left by one bit and XORs in 0x1D
 *       (v_gf8poly) for each byte whose top bit was set.
 *
 * In:    w0 = vects  (sources + 2; anything below 3 is rejected)
 *        w1 = len    (bytes per buffer, MUST be a multiple of 16)
 *        x2 = src    (array of vects buffer pointers)
 * Out:   w0 = 0 when every P and Q byte matches, 1 otherwise
 * Clobb: x3-x7, x9-x12, v0-v7, v16-v29, condition flags
 *        d8-d15 are used by the 128-byte path; they are saved on entry
 *        to that path and restored before leaving it.
 * Stack: 64 bytes, only while the 128-byte path is running.
 */

/* arguments */
w_vects		.req	w0	/* MUST >= 3 */
x_vects		.req	x0	/* only valid after w_vects has been written */
w_len		.req	w1	/* MUST be 16x bytes */
				/* no x_len alias: bits 63:32 of x1 are not part
				 * of the int argument, always extend from w_len */
x_src		.req	x2

/* returns */
w_ret		.req	w0

/* local variables */
x_dst_p		.req	x3
x_dst_q		.req	x4
x_dst_q_end  	.req	x5
w_col		.req	w6
x_col		.req	x6
x_src_ptr	.req	x7
x_src_ptr_end	.req	x9
x_src_last	.req	x10
x_srcn		.req	x11
w_min		.req	w12
/* vectors */
/* v0  ~ v7 : temporary p */
/* v8  ~ v15: temporary q */
/* v16 ~ v23: next 128 bytes */
v_mask0		.req	v24
v_mask1		.req	v25
v_mask2		.req	v26
v_mask3		.req	v27
v_gf8poly	.req	v28
v_0x80		.req	v29

/*
 * src_ptr_end -->
 *          -------+----------+
 *           .     |  src[0]  |
 *           .     +----------+            +------------------+
 *     src_ptr --> |  src[1]  | - srcn ->  |     buffer       |
 *           .     +----------+            +------------------+
 *           .     |  ......  |
 *           .     +----------+
 *           .     | src[v-4] |
 *          -------+----------+  src_last  +------------------+
 *        src  --> | src[v-3] | ---------> |      buffer      |
 *                 +----------+            +------------------+
 *                 | src[v-2] | - dst_p -> |      buffer      |
 *                 +----------+            +------------------+
 *                 | src[v-1] | - dst_q -> |      buffer      | dst_q_end
 *                 +----------+            +------------------+
 */

pq_check_neon:
	/*
	 * With fewer than 3 vectors "vects - 3" would wrap and be used as
	 * an index into src[]; report failure instead of dereferencing it.
	 */
	cmp	w_vects, #3
	blt	.Lerror

	/* one slot below src[0]: stop value for the downward source walk */
	sub	x_src_ptr_end, x_src, #8

	/* the 32-bit write zeroes x0[63:32], so x_vects is usable below */
	sub	w_vects, w_vects, #3
	add	x_src, x_src, x_vects, lsl #3	/* x_src = &src[vects-3] */

	ldr	x_src_last, [x_src]
	ldp	x_dst_p, x_dst_q, [x_src, #8]

	/*
	 * len is a 32-bit argument: extend it explicitly rather than
	 * reading x1, whose upper half is not defined by the caller.
	 */
	add	x_dst_q_end, x_dst_q, w_len, uxtw

	mov	w_min, #-1
	mov	w_col, #0			/* byte offset into each source */
	movi	v_gf8poly.16b, #0x1D
	movi	v_0x80.16b, #0x80

.Lloop128_init:
	/* less than 128 bytes? */
	cmp	w_len, #128
	blo	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8,  d9,  [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* last Q address at which a full 128-byte block still fits */
	sub	x_dst_q_end, x_dst_q_end, #128

	/* batch process (vects-2)*128 bytes */
	/* v0~v7: p;  v8~v15: q;  v16~v23: in */
.Lloop128:
	/* seed p with the 128 bytes of src[vects-3] */
	ldr	q0, [x_src_last, #16*0]
	ldr	q1, [x_src_last, #16*1]
	ldr	q2, [x_src_last, #16*2]
	ldr	q3, [x_src_last, #16*3]
	ldr	q4, [x_src_last, #16*4]
	ldr	q5, [x_src_last, #16*5]
	ldr	q6, [x_src_last, #16*6]
	ldr	q7, [x_src_last, #16*7]
	add	x_src_last, x_src_last, #128

	/* q starts from the same data */
	mov	v8.16b,  v0.16b
	mov	v9.16b,  v1.16b
	mov	v10.16b, v2.16b
	mov	v11.16b, v3.16b
	mov	v12.16b, v4.16b
	mov	v13.16b, v5.16b
	mov	v14.16b, v6.16b
	mov	v15.16b, v7.16b

	/* only one source buffer: nothing left to fold in */
	cbz	w_vects, .Lloop128_vects_end

	sub	x_src_ptr, x_src, #8		/* &src[vects-4] */
.Lloop128_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	add	x_srcn, x_srcn, x_col
	/* flags are consumed by the bne at the bottom of this loop;
	 * nothing in between writes them */
	cmp	x_src_ptr, x_src_ptr_end

	ldr	q16, [x_srcn, #16*0]
	ldr	q17, [x_srcn, #16*1]
	ldr	q18, [x_srcn, #16*2]
	ldr	q19, [x_srcn, #16*3]
	ldr	q20, [x_srcn, #16*4]
	ldr	q21, [x_srcn, #16*5]
	ldr	q22, [x_srcn, #16*6]
	ldr	q23, [x_srcn, #16*7]

	/* p ^= in */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	/* q = mul2(q) ^ in, first four registers */
	cmhs	v_mask0.16b, v8.16b,  v_0x80.16b
	cmhs	v_mask1.16b, v9.16b,  v_0x80.16b
	cmhs	v_mask2.16b, v10.16b, v_0x80.16b
	cmhs	v_mask3.16b, v11.16b, v_0x80.16b
	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
	shl	v8.16b,  v8.16b,  #1
	shl	v9.16b,  v9.16b,  #1
	shl	v10.16b, v10.16b, #1
	shl	v11.16b, v11.16b, #1
	eor	v8.16b,  v8.16b,  v_mask0.16b
	eor	v9.16b,  v9.16b,  v_mask1.16b
	eor	v10.16b, v10.16b, v_mask2.16b
	eor	v11.16b, v11.16b, v_mask3.16b
	eor	v8.16b,  v8.16b,  v16.16b
	eor	v9.16b,  v9.16b,  v17.16b
	eor	v10.16b, v10.16b, v18.16b
	eor	v11.16b, v11.16b, v19.16b

	/* q = mul2(q) ^ in, last four registers */
	cmhs	v_mask0.16b, v12.16b, v_0x80.16b
	cmhs	v_mask1.16b, v13.16b, v_0x80.16b
	cmhs	v_mask2.16b, v14.16b, v_0x80.16b
	cmhs	v_mask3.16b, v15.16b, v_0x80.16b
	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
	shl	v12.16b, v12.16b, #1
	shl	v13.16b, v13.16b, #1
	shl	v14.16b, v14.16b, #1
	shl	v15.16b, v15.16b, #1
	eor	v12.16b, v12.16b, v_mask0.16b
	eor	v13.16b, v13.16b, v_mask1.16b
	eor	v14.16b, v14.16b, v_mask2.16b
	eor	v15.16b, v15.16b, v_mask3.16b
	eor	v12.16b, v12.16b, v20.16b
	eor	v13.16b, v13.16b, v21.16b
	eor	v14.16b, v14.16b, v22.16b
	eor	v15.16b, v15.16b, v23.16b

	bne	.Lloop128_vects

.Lloop128_vects_end:
	/* v16~v23: true p, q */
	ldr	q16, [x_dst_p, #16*0]
	ldr	q17, [x_dst_p, #16*1]
	ldr	q18, [x_dst_p, #16*2]
	ldr	q19, [x_dst_p, #16*3]
	ldr	q20, [x_dst_p, #16*4]
	ldr	q21, [x_dst_p, #16*5]
	ldr	q22, [x_dst_p, #16*6]
	ldr	q23, [x_dst_p, #16*7]

	/* per byte: 0xFF where computed p equals stored p, else 0x00 */
	cmeq	v0.16b, v0.16b, v16.16b
	cmeq	v1.16b, v1.16b, v17.16b
	cmeq	v2.16b, v2.16b, v18.16b
	cmeq	v3.16b, v3.16b, v19.16b
	cmeq	v4.16b, v4.16b, v20.16b
	cmeq	v5.16b, v5.16b, v21.16b
	cmeq	v6.16b, v6.16b, v22.16b
	cmeq	v7.16b, v7.16b, v23.16b

	ldr	q16, [x_dst_q, #16*0]
	ldr	q17, [x_dst_q, #16*1]
	ldr	q18, [x_dst_q, #16*2]
	ldr	q19, [x_dst_q, #16*3]
	ldr	q20, [x_dst_q, #16*4]
	ldr	q21, [x_dst_q, #16*5]
	ldr	q22, [x_dst_q, #16*6]
	ldr	q23, [x_dst_q, #16*7]

	/* reduce the eight p match masks into v0 */
	and	v0.16b, v0.16b, v1.16b
	and	v2.16b, v2.16b, v3.16b
	and	v4.16b, v4.16b, v5.16b
	and	v6.16b, v6.16b, v7.16b
	and	v0.16b, v0.16b, v2.16b
	and	v4.16b, v4.16b, v6.16b
	and	v0.16b, v0.16b, v4.16b

	/* per byte: 0xFF where computed q equals stored q, else 0x00 */
	cmeq	v8.16b,  v8.16b,  v16.16b
	cmeq	v9.16b,  v9.16b,  v17.16b
	cmeq	v10.16b, v10.16b, v18.16b
	cmeq	v11.16b, v11.16b, v19.16b
	cmeq	v12.16b, v12.16b, v20.16b
	cmeq	v13.16b, v13.16b, v21.16b
	cmeq	v14.16b, v14.16b, v22.16b
	cmeq	v15.16b, v15.16b, v23.16b

	/* reduce the eight q match masks into v8 */
	and	v8.16b,  v8.16b,  v9.16b
	and	v10.16b, v10.16b, v11.16b
	and	v12.16b, v12.16b, v13.16b
	and	v14.16b, v14.16b, v15.16b
	and	v8.16b,  v8.16b,  v10.16b
	and	v12.16b, v12.16b, v14.16b
	and	v8.16b,  v8.16b,  v12.16b

	and	v0.16b, v0.16b, v8.16b

	/* w_min == 0 iff at least one byte lane recorded a mismatch */
	uminv	b0, v0.16b
	umov	w_min, v0.b[0]
	/* leave through the restore code, not straight to .Lerror */
	cbz	w_min, .Lloop128_end

	add	x_dst_p, x_dst_p, #128
	add	x_dst_q, x_dst_q, #128
	cmp	x_dst_q, x_dst_q_end
	add	w_col, w_col, #128		/* add does not touch the flags */
	bls	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	cbz	w_min, .Lerror

	/* undo the 128-byte bias: x_dst_q_end is the true end of Q again */
	add	x_dst_q_end, x_dst_q_end, #128

.Lloop16_init:
	/* no remainder below 128 bytes: nothing left to check */
	tst	w_len, #0x7F
	beq	.Lloop16_end
	/* last Q address at which a full 16-byte block still fits */
	sub	x_dst_q_end, x_dst_q_end, #16

	/* batch process (vects-2)*16 bytes */
	/* v0: p;  v1: q;  v2: in;  v3: mask */
.Lloop16:
	ldr	q0, [x_src_last], #16
	mov	v1.16b, v0.16b

	cbz	w_vects, .Lloop16_vects_end

	sub	x_src_ptr, x_src, #8		/* &src[vects-4] */
.Lloop16_vects:
	ldr	x_srcn, [x_src_ptr], #-8
	ldr	q2, [x_srcn, x_col]
	/* flags are consumed by the bne at the bottom of this loop */
	cmp	x_src_ptr, x_src_ptr_end

	/* p ^= in */
	eor	v0.16b, v0.16b, v2.16b

	/* q = mul2(q) ^ in */
	cmhs	v3.16b, v1.16b, v_0x80.16b
	and	v3.16b, v3.16b, v_gf8poly.16b

	shl	v1.16b, v1.16b, #1
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v3.16b

	bne	.Lloop16_vects

.Lloop16_vects_end:
	/* v4: true p;  v5: true q */
	ldr	q4, [x_dst_p], #16
	ldr	q5, [x_dst_q], #16
	/* flags are consumed by the bls below; cbz and add leave them alone */
	cmp	x_dst_q, x_dst_q_end

	cmeq	v0.16b, v0.16b, v4.16b
	cmeq	v1.16b, v1.16b, v5.16b
	and	v0.16b, v0.16b, v1.16b

	/* w_min == 0 iff at least one byte lane recorded a mismatch */
	uminv	b0, v0.16b
	umov	w_min, v0.b[0]
	cbz	w_min, .Lerror

	add	w_col, w_col, #16
	bls	.Lloop16

.Lloop16_end:
	mov	w_ret, #0
	ret

.Lerror:
	mov	w_ret, #1
	ret

.size pq_check_neon, .-pq_check_neon