xref: /isa-l/crc/aarch64/crc16_t10dif_pmull.S (revision 9ab5a9e579c4fb4e2a3c92d73ccd6d97291d0e80)
1########################################################################
2#  Copyright(c) 2019 Arm Corporation All rights reserved.
3#
4#  Redistribution and use in source and binary forms, with or without
5#  modification, are permitted provided that the following conditions
6#  are met:
7#    * Redistributions of source code must retain the above copyright
8#      notice, this list of conditions and the following disclaimer.
9#    * Redistributions in binary form must reproduce the above copyright
10#      notice, this list of conditions and the following disclaimer in
11#      the documentation and/or other materials provided with the
12#      distribution.
13#    * Neither the name of Arm Corporation nor the names of its
14#      contributors may be used to endorse or promote products derived
15#      from this software without specific prior written permission.
16#
17#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28#########################################################################
29
30#include "../include/aarch64_label.h"
31
/*
 * Assembler setup and register aliases for crc16_t10dif_pmull.
 * The pmull/pmull2 instructions used further down rely on the "crypto"
 * extension that is enabled by the .arch line.
 */
32	.arch armv8-a+crc+crypto
33	.text
34	.align	3
35	.global	cdecl(crc16_t10dif_pmull)
36#ifndef __APPLE__
37	.type	crc16_t10dif_pmull, %function
38#endif
39
40/* uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len) */
41
42/* arguments */
43w_seed			.req	w0
44x_buf			.req	x1
45x_len			.req	x2
46w_len			.req	w2
47
48/* returns */
49w_ret			.req	w0
50
51/* these are the global temporary registers */
52w_tmp			.req	w5
53x_tmp			.req	x5
54x_tmp1			.req	x6
55x_tmp2			.req	x7
56
/* d/q/v names below are different views of the same two vector registers (v0, v1) */
57d_tmp1			.req	d0
58d_tmp2			.req	d1
59q_tmp1			.req	q0
60q_tmp2			.req	q1
61v_tmp1			.req	v0
62v_tmp2			.req	v1
63
64/* local variables */
/*
 * Note the sharing: w_crc/x_crc and x_buf_saved all name x0 (the same
 * register as w_seed/w_ret), so the CRC value and the saved buffer pointer
 * cannot be live at the same time.
 */
65w_counter		.req	w3
66w_crc			.req	w0
67x_crc			.req	x0
68x_counter		.req	x3
69x_crc16tab		.req	x4
70x_buf_saved		.req	x0
71
/*
 * crc16_t10dif_pmull: entry point and byte-at-a-time table loop.
 *
 * In:   w_seed (w0) = initial CRC, x_buf (x1) = data, x_len (x2) = byte count
 * Out:  w_ret (w0)  = CRC, truncated to 16 bits by the uxth instructions
 *
 * Inputs longer than 63 bytes branch to .crc_fold; that path jumps back to
 * .crc_table_loop_pre at its end so that any remaining bytes are finished
 * by the table loop. Shorter inputs are handled by the table loop alone.
 *
 * NOTE(review): 16 bytes of stack are reserved here and released at .end,
 * but no instruction visible in this function accesses them - confirm
 * whether the adjustment is still needed.
 */
72cdecl(crc16_t10dif_pmull):
73	cmp	x_len, 63
74	sub	sp, sp, #16
75	uxth	w_seed, w_seed		/* keep only the low 16 bits of the seed */
76	bhi	.crc_fold		/* len > 63, flags come from the cmp above */
77
	/* short input: nothing consumed yet, start at index 0 */
78	mov	x_tmp, 0
79	mov	w_counter, 0
80
/*
 * Shared entry to the table loop (fall-through above, or a branch from the
 * end of the PMULL path). Expects:
 *   x_tmp     = index that is compared against x_len (nothing to do if len <= x_tmp)
 *   w_counter = first index to process; it is sign-extended from 32 bits
 *   x_buf     = pointer that already includes that first index (it is
 *               subtracted again below because the loop indexes x_buf by
 *               x_counter)
 */
81.crc_table_loop_pre:
82	cmp	x_len, x_tmp
83	bls	.end
84
/* both arms do the same work; only the relocation syntax for the table address differs */
85#ifndef __APPLE__
86	sxtw	x_counter, w_counter
87	adrp	x_crc16tab, .LANCHOR0
88	sub	x_buf, x_buf, x_counter
89	add	x_crc16tab, x_crc16tab, :lo12:.LANCHOR0
90#else
91	sxtw	x_counter, w_counter
92	adrp	x_crc16tab, .LANCHOR0@PAGE
93	sub	x_buf, x_buf, x_counter
94	add	x_crc16tab, x_crc16tab, .LANCHOR0@PAGEOFF
95#endif
96
97	.align 2
/* one byte per iteration: crc = tab[(crc >> 8) ^ byte] ^ (crc << 8), kept to 16 bits */
98.crc_table_loop:
99	ldrb	w_tmp, [x_buf, x_counter]		/* next input byte */
100	add	x_counter, x_counter, 1
101	cmp	x_len, x_counter			/* flags are consumed by the bhi below; nothing in between writes them */
102	eor	w_tmp, w_tmp, w_crc, lsr 8		/* table index = byte ^ (crc >> 8) */
103	ldrh	w_tmp, [x_crc16tab, w_tmp, sxtw 1]	/* halfword entry, index scaled by 2 */
104	eor	w_crc, w_tmp, w_crc, lsl 8
105	uxth	w_crc, w_crc
106	bhi	.crc_table_loop				/* loop while len > counter */
107
/* common exit: release the stack reserved at entry, result is in w0 */
108.end:
109	add	sp, sp, 16
110	ret
111
112/* carry less multiplication, part1 - before loop */
/*
 * Sets up the PMULL path: places the seed in the top 16 bits of a 128-bit
 * vector, loads the first 64 input bytes into v_x0..v_x3, reverses the byte
 * order of each 16-byte vector with the shuffle mask, and XORs the seed
 * vector into the first one. Falls through into the folding loop, or skips
 * it when there is only one whole 64-byte block.
 */
113q_x0			.req	q2
114q_x1			.req	q3
115q_x2			.req	q4
116q_x3			.req	q5
117
118v_x0			.req	v2
119v_x1			.req	v3
120v_x2			.req	v4
121v_x3			.req	v5
122
123d_x0			.req	d2
124d_x1			.req	d3
125d_x2			.req	d4
126d_x3			.req	d5
127
/* v7 holds the byte-reversal mask (the code below also refers to it directly as q7/v7) */
128q_permutation		.req	q7
129v_permutation		.req	v7
130
131// the following register aliases are only used in part1
132d_tmp3			.req	d16
133v_tmp3			.req	v16
134
135	.align 3
136.crc_fold:
137	fmov	d_tmp1, x_crc			/* v0 = seed */
138	fmov	d_tmp2, xzr			/* v1 = 0 */
139	dup	d_tmp3, v_tmp2.d[0]		/* v16 = 0 */
140	shl	d_tmp1, d_tmp1, 48		/* seed moved to the top 16 bits of the 64-bit lane */
141	ins	v_tmp3.d[1], v_tmp1.d[0]	/* v16 = { low: 0, high: seed << 48 } */
142
	/* x_counter = (len rounded down to a multiple of 64) - 64 */
143	and	x_counter, x_len, -64
144	sub	x_counter, x_counter, #64
145	cmp	x_counter, 63			/* flags are consumed by the bls at the end of part1 */
146	add	x_buf_saved, x_buf, 64		/* x0 is reused as the read pointer; the seed already lives in v0/v16 */
147
	/* first 64 bytes of input */
148        ldp q_x0, q_x1, [x_buf]
149        ldp q_x2, q_x3, [x_buf, 32]
150
/* load the 16-byte shuffle mask into q7 (q_permutation) */
151#ifndef __APPLE__
152	adrp	x_tmp, .shuffle_mask_lanchor
153	ldr	q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
154#else
155	adrp	x_tmp, .shuffle_mask_lanchor@PAGE
156	ldr	q7, [x_tmp, .shuffle_mask_lanchor@PAGEOFF]
157#endif
158
	/* byte-reverse the first vector and XOR in the seed vector */
159	tbl	v_tmp1.16b, {v_x0.16b}, v7.16b
160	eor	v_x0.16b, v_tmp3.16b, v_tmp1.16b
161
	/* byte-reverse the other three vectors in place */
162	tbl	v_x1.16b, {v_x1.16b}, v7.16b
163	tbl	v_x2.16b, {v_x2.16b}, v7.16b
164	tbl	v_x3.16b, {v_x3.16b}, v7.16b
165	bls	.crc_fold_loop_end		/* x_counter <= 63: no further whole 64-byte block, skip the loop */
166
167/* carry less multiplication, part2 - loop */
/*
 * Main folding loop. Each iteration consumes 64 more input bytes: every
 * accumulator v_x0..v_x3 is multiplied (carry-less) by the two 64-bit lanes
 * of fold_constant, the two products are XORed together, and the
 * byte-reversed new data is XORed in.
 */
168q_y0			.req	q28
169q_y1			.req	q29
170q_y2			.req	q30
171q_y3			.req	q31
172
173v_y0			.req	v28
174v_y1			.req	v29
175v_y2			.req	v30
176v_y3			.req	v31
177
178d_x0_h			.req	d24
179d_x0_l			.req	d2
180d_x1_h			.req	d25
181d_x1_l			.req	d3
182d_x2_h			.req	d26
183d_x2_l			.req	d4
184d_x3_h			.req	d27
185d_x3_l			.req	d5
186
187v_x0_h			.req	v24
188v_x0_l			.req	v2
189v_x1_h			.req	v25
190v_x1_l			.req	v3
191v_x2_h			.req	v26
192v_x2_l			.req	v4
193v_x3_h			.req	v27
194v_x3_l			.req	v5
195
196v_tmp1_x0		.req	v24
197v_tmp1_x1		.req	v25
198v_tmp1_x2		.req	v26
199v_tmp1_x3		.req	v27
200
201q_fold_const		.req	q17
202v_fold_const		.req	v17
203
	/* PC-relative literal load of the 16-byte fold_constant; done once, before the loop */
204        ldr q_fold_const, fold_constant
205
206	.align 2
207.crc_fold_loop:
208	add	x_buf_saved, x_buf_saved, 64
209	sub	x_counter, x_counter, #64
210	cmp	x_counter, 63		/* flags are consumed by the bhi at the bottom; the vector code in between leaves them alone */
211
	/* next 64 bytes of input (x_buf_saved was already advanced past them) */
212        ldp q_y0, q_y1, [x_buf_saved, -64]
213        ldp q_y2, q_y3, [x_buf_saved, -32]
214
	/* prefetch hints for data further ahead in the buffer */
215        prfm pldl2strm,  [x_buf_saved, #1024]
216        prfm pldl2strm,  [x_buf_saved, #1088]
217
	/* pmull2 multiplies the high 64-bit lanes, pmull the low 64-bit lanes */
218        pmull2 v_tmp1_x0.1q, v_x0.2d, v_fold_const.2d
219        pmull v_x0.1q, v_x0.1d, v_fold_const.1d
220
221        pmull2 v_tmp1_x1.1q, v_x1.2d, v_fold_const.2d
222        pmull v_x1.1q, v_x1.1d, v_fold_const.1d
223
224        pmull2 v_tmp1_x2.1q, v_x2.2d, v_fold_const.2d
225        pmull v_x2.1q, v_x2.1d, v_fold_const.1d
226
227        pmull2 v_tmp1_x3.1q, v_x3.2d, v_fold_const.2d
228        pmull v_x3.1q, v_x3.1d, v_fold_const.1d
229
	/* per accumulator: byte-reverse the new data, then x = high product ^ low product ^ data */
230        tbl v_y0.16b, {v_y0.16b}, v_permutation.16b
231        eor v_x0.16b, v_tmp1_x0.16b, v_x0.16b
232        eor v_x0.16b, v_x0.16b, v_y0.16b
233
234        tbl v_y1.16b, {v_y1.16b}, v_permutation.16b
235        eor v_x1.16b, v_tmp1_x1.16b, v_x1.16b
236        eor v_x1.16b, v_x1.16b, v_y1.16b
237
238        tbl v_y2.16b, {v_y2.16b}, v_permutation.16b
239        eor v_x2.16b, v_tmp1_x2.16b, v_x2.16b
240        eor v_x2.16b, v_x2.16b, v_y2.16b
241
242        tbl v_y3.16b, {v_y3.16b}, v_permutation.16b
243        eor v_x3.16b, v_tmp1_x3.16b, v_x3.16b
244        eor v_x3.16b, v_x3.16b, v_y3.16b
245
246	bhi	.crc_fold_loop		/* continue while x_counter > 63 */
247
248/* carry less multiplication, part3 - after loop */
249/* folding 512bit ---> 128bit */
250
251// input parameters:
252// v_x0 => v2
253// v_x1 => v3
254// v_x2 => v4
255// v_x3 => v5
256
257// v0, v1, v6, v16, v30 are tmp registers
/*
 * Folds the four 128-bit accumulators into one: v_x0 is folded into v_x1,
 * v_x1 into v_x2, and v_x2 into v_x3, with the final value written to v_x0.
 * Each step multiplies the high lane by p1[1] and the low lane by p1[0]
 * (carry-less) and XORs both products into the next accumulator.
 *
 * This block also prepares the registers that the table loop reads when
 * the PMULL path later branches to .crc_table_loop_pre.
 */
258
259.crc_fold_loop_end:
260	mov	x_tmp, 0x4c1a0000	/* p1 [1] */
261	fmov	d0, x_tmp
262	mov	x_tmp, 0xfb0b0000	/* p1 [0] */
263	fmov	d1, x_tmp
264
	/*
	 * Tail hand-off for .crc_table_loop_pre, done with full 64-bit
	 * arithmetic. The earlier form derived the consumed byte count from
	 * w_len and sign-extended it, which gives a wrong offset once len
	 * has bit 31 or any higher bit set. Instead:
	 *   x_buf     -> first byte that was not folded
	 *   x_len     =  number of tail bytes (len % 64)
	 *   x_tmp     =  0, w_counter = 0 (the table loop starts at index 0)
	 * None of x_buf, x_len, x_tmp or w_counter is touched again before
	 * the branch to .crc_table_loop_pre.
	 */
	and	x_tmp, x_len, -64
	add	x_buf, x_buf, x_tmp
	and	x_len, x_len, 63
	mov	x_tmp, 0
	mov	w_counter, 0
268
	/* fold v_x0 into v_x1 */
269	dup	d6, v_x0.d[1]
270	dup	d30, v_x0.d[0]
271	pmull	v6.1q, v6.1d, v0.1d
272	pmull	v30.1q, v30.1d, v1.1d
273	eor	v6.16b, v6.16b, v30.16b
274	eor	v_x1.16b, v6.16b, v_x1.16b
275
	/* fold v_x1 into v_x2 (v16 is the scratch register for the low product) */
276	dup	d6, v_x1.d[1]
277	dup	d30, v_x1.d[0]
278	pmull	v6.1q, v6.1d, v0.1d
279	pmull	v16.1q, v30.1d, v1.1d
280	eor	v6.16b, v6.16b, v16.16b
281	eor	v_x2.16b, v6.16b, v_x2.16b
282
	/* fold v_x2 into v_x3; v_x0 is reused as scratch and receives the 128-bit result */
283	dup	d_x0, v_x2.d[1]
284	dup	d30, v_x2.d[0]
285	pmull	v0.1q, v_x0.1d, v0.1d	/* overwrites p1[1], which is not needed afterwards */
286	pmull	v_x0.1q, v30.1d, v1.1d
287	eor	v1.16b, v0.16b, v_x0.16b
288	eor	v_x0.16b, v1.16b, v_x3.16b
289
290/* carry less multiplication, part3 - after loop */
291/* crc16 fold function */
/*
 * Reduces the 128-bit value in v_x0 (v2) in two carry-less multiply steps
 * using the constants p0[1] and p0[0]. The result is left in v_x0 for the
 * Barrett reduction that follows.
 */
292d_16fold_p0_h		.req	d18
293v_16fold_p0_h		.req	v18
294
295d_16fold_p0_l		.req	d4
296v_16fold_p0_l		.req	v4
297
298v_16fold_from		.req	v_x0
299d_16fold_from_h		.req	d3
300v_16fold_from_h		.req	v3
301
/* v7 held the byte-reversal mask until now; from here on it is an all-zero vector */
302v_16fold_zero		.req	v7
303
304v_16fold_from1		.req	v16
305
306v_16fold_from2		.req	v0
307d_16fold_from2_h	.req	d6
308v_16fold_from2_h	.req	v6
309
/* same register as v_16fold_from2 (v0) */
310v_16fold_tmp		.req	v0
311
312	movi	v_16fold_zero.4s, 0
313	mov	x_tmp1, 0x2d560000		/* p0 [1] */
314	mov	x_tmp2, 0x13680000		/* p0 [0] */
315
	/*
	 * tmp = low 64 bits of the input, moved to the high lane (low lane zero),
	 * then the whole 128-bit value shifted down by 4 bytes with zero fill
	 */
316	ext	v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
317	ext	v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4
318
	/* from2 = tmp ^ (high 64 bits of the input * p0[1]) */
319	dup	d_16fold_from_h, v_16fold_from.d[1]
320	fmov	d_16fold_p0_h, x_tmp1
321	pmull	v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
322	eor	v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b
323
	/* v_x0 = from2 ^ (high 64 bits of from2 * p0[0]) */
324	dup	d_16fold_from2_h, v_16fold_from2.d[1]
325	fmov	d_16fold_p0_l, x_tmp2
326	pmull	v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
327	eor	v_x0.16b, v0.16b, v6.16b
328
329/* carry less multiplication, part3 - after loop */
330/* crc16 barrett reduction function */
/*
 * Final reduction of the value in v_x0 to a 16-bit CRC using the two
 * constants br[0] and br[1], then a jump to the table loop so that any
 * bytes not covered by the 64-byte folding are processed.
 * Relies on v7 still being all zeroes from the preceding fold section.
 */
331
332// input parameters:
333// v_x0:			v2
334// barrett reduction constant:	br[0], br[1]
335
336d_br0			.req	d3
337v_br0			.req	v3
338d_br1			.req	d5
339v_br1			.req	v5
340
	/* br[0] = 0x1f65a57f9, assembled in three 16-bit pieces */
341	mov	x_tmp1, 0x57f9			/* br[0] low */
342	movk	x_tmp1, 0xf65a, lsl 16		/* br[0] high */
343	movk	x_tmp1, 0x1, lsl 32
344	fmov	d_br0, x_tmp1
345
346	dup	d1, v_x0.d[0]			/* low 64 bits of the input */
347	dup	d1, v1.d[0]			/* NOTE(review): copies d1 onto itself, looks redundant - confirm */
348	ext	v1.16b, v1.16b, v7.16b, #4	/* shift down by 4 bytes, zero fill from v7 */
349	pmull	v4.1q, v1.1d, v_br0.1d
350
351	ext	v1.16b, v4.16b, v7.16b, #4	/* product shifted down by 4 bytes */
	/* br[1] = 0x18bb70000 */
352	mov	x_tmp1, 0x8bb70000		/* br[1] low */
353	movk	x_tmp1, 0x1, lsl 32		/* br[1] high */
354
355	fmov	d_br1, x_tmp1
356	pmull	v_br1.1q, v1.1d, v_br1.1d
357	eor	v_x0.16b, v_x0.16b, v_br1.16b
358
	/* the CRC is bits 16..31 of the low lane; x0 becomes the CRC register again */
359	umov	x0, v_x0.d[0]
360	ubfx	x0, x0, 16, 16
	/* finish with the table loop; its inputs were prepared just after .crc_fold_loop_end */
361	b	.crc_table_loop_pre
362
363#ifndef __APPLE__
364	.size	crc16_t10dif_pmull, .-crc16_t10dif_pmull
365#endif
366
/*
 * 16-byte folding constant used by the 64-byte loop. It is loaded once into
 * q_fold_const with a PC-relative literal load (ldr q, label) and is emitted
 * right after the function, before the switch to the read-only data section.
 * The first two words form the lane used by pmull, the last two words the
 * lane used by pmull2 (assuming little-endian data layout - confirm).
 */
367	.align	4
368fold_constant:
369	.word 0x87e70000
370	.word 0x00000000
371	.word 0x371d0000
372	.word 0x00000000
373
/*
 * Byte-reversal index vector for tbl: entry i selects source byte 15 - i.
 * It is loaded into q7 through the .shuffle_mask_lanchor symbol.
 *
 * ASM_DEF_RODATA comes from the included header (not visible in this file);
 * presumably it switches to a read-only data section.
 *
 * The explicit 16-byte alignment is required because the mask is fetched
 * with a q-register ldr that uses a :lo12: page offset, and that offset is
 * encoded in units of 16 bytes. Without the directive the load is only
 * correct if the mask happens to land on a 16-byte boundary.
 */
374ASM_DEF_RODATA
	.align	4
375.shuffle_mask_lanchor = . + 0
376#ifndef __APPLE__
377	.type	shuffle_mask, %object
378	.size	shuffle_mask, 16
379#endif
380shuffle_mask:
381	.byte	15, 14, 13, 12, 11, 10, 9, 8
382	.byte	7,   6,  5,  4,  3,  2, 1, 0
383
/*
 * Lookup table for the byte-at-a-time loop: 256 halfword entries (512 bytes),
 * addressed through .LANCHOR0 and indexed by (crc >> 8) ^ input byte.
 * Entry 1 is 0x8bb7, presumably the T10-DIF generator polynomial - confirm.
 */
384	.align	4
385.LANCHOR0 = . + 0
386#ifndef __APPLE__
387	.type	crc16tab, %object
388	.size	crc16tab, 512
389#endif
390crc16tab:
391	.hword  0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
392	.hword  0xEFBD, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
393	.hword  0x54CD, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
394	.hword  0xBB70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
395	.hword  0xA99A, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
396	.hword  0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
397	.hword  0xFD57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
398	.hword  0x12EA, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
399	.hword  0xD883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
400	.hword  0x373E, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
401	.hword  0x8C4E, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
402	.hword  0x63F3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
403	.hword  0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
404	.hword  0x9EA4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
405	.hword  0x25D4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
406	.hword  0xCA69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
407	.hword  0x3AB1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
408	.hword  0xD50C, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
409	.hword  0x6E7C, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
410	.hword  0x81C1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
411	.hword  0x932B, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
412	.hword  0x7C96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
413	.hword  0xC7E6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
414	.hword  0x285B, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
415	.hword  0xE232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
416	.hword  0x0D8F, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
417	.hword  0xB6FF, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
418	.hword  0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
419	.hword  0x4BA8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
420	.hword  0xA415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
421	.hword  0x1F65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
422	.hword  0xF0D8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3
423