########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

#include "../include/aarch64_label.h"

	.arch armv8-a+crc+crypto
	.text
	.align	3
	.global	cdecl(crc16_t10dif_pmull)
#ifndef __APPLE__
	.type	crc16_t10dif_pmull, %function
#endif

/*
 * uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len)
 *
 * In:    w0 = seed (only the low 16 bits are used)
 *        x1 = buf
 *        x2 = len (full 64-bit byte count)
 * Out:   w0 = updated 16-bit CRC, zero-extended
 * Uses:  x1, x3-x7, v0-v7, v16-v18, v24-v31, condition flags
 * Stack: none
 *
 * Buffers of fewer than 64 bytes are processed one byte at a time with the
 * crc16tab lookup table.  Longer buffers are folded 64 bytes per iteration
 * with PMULL, reduced to a 16-bit value, and the remaining (len % 64) bytes
 * are then finished by the same table loop.
 */

/* arguments */
w_seed		.req	w0
x_buf		.req	x1
x_len		.req	x2
w_len		.req	w2

/* returns */
w_ret		.req	w0

/* these as global temporary registers */
w_tmp		.req	w5
x_tmp		.req	x5
x_tmp1		.req	x6
x_tmp2		.req	x7

d_tmp1		.req	d0
d_tmp2		.req	d1
q_tmp1		.req	q0
q_tmp2		.req	q1
v_tmp1		.req	v0
v_tmp2		.req	v1

/* local variables */
w_counter	.req	w3
w_crc		.req	w0
x_crc		.req	x0
x_counter	.req	x3
x_crc16tab	.req	x4
x_buf_saved	.req	x0

cdecl(crc16_t10dif_pmull):
	cmp	x_len, 63
	uxth	w_seed, w_seed			// crc = seed & 0xffff (flags untouched)
	bhi	.crc_fold			// len >= 64: use the PMULL path

	/* short buffer: nothing has been consumed yet */
	mov	x_tmp, 0
	mov	x_counter, 0

/*
 * Table-driven tail.  On entry:
 *   x_tmp == x_counter == number of bytes already consumed (64-bit),
 *   x_buf points at the first unconsumed byte,
 *   w_crc holds the running 16-bit CRC.
 */
.crc_table_loop_pre:
	cmp	x_len, x_tmp
	bls	.end				// no bytes left

	/*
	 * Bias the pointer so that [x_buf, x_counter] addresses the first
	 * unconsumed byte; x_counter can then be compared directly with
	 * x_len.  The offset is kept in 64 bits: the previous sxtw of the
	 * 32-bit counter mishandled len >= 2^31.
	 */
	sub	x_buf, x_buf, x_counter
#ifndef __APPLE__
	adrp	x_crc16tab, .LANCHOR0
	add	x_crc16tab, x_crc16tab, :lo12:.LANCHOR0
#else
	adrp	x_crc16tab, .LANCHOR0@PAGE
	add	x_crc16tab, x_crc16tab, .LANCHOR0@PAGEOFF
#endif

	.align	2
.crc_table_loop:
	ldrb	w_tmp, [x_buf, x_counter]		// next input byte
	add	x_counter, x_counter, 1
	cmp	x_len, x_counter
	eor	w_tmp, w_tmp, w_crc, lsr 8		// idx = byte ^ (crc >> 8)
	ldrh	w_tmp, [x_crc16tab, w_tmp, sxtw 1]	// crc16tab[idx]
	eor	w_crc, w_tmp, w_crc, lsl 8		// crc = tab ^ (crc << 8)
	uxth	w_crc, w_crc				// keep 16 bits
	bhi	.crc_table_loop				// while (counter < len)

.end:
	ret

/* carry less multiplication, part1 - before loop */
q_x0		.req	q2
q_x1		.req	q3
q_x2		.req	q4
q_x3		.req	q5

v_x0		.req	v2
v_x1		.req	v3
v_x2		.req	v4
v_x3		.req	v5

d_x0		.req	d2
d_x1		.req	d3
d_x2		.req	d4
d_x3		.req	d5

q_permutation	.req	q7
v_permutation	.req	v7

// the following registers are only used in this part1
d_tmp3		.req	d16
v_tmp3		.req	v16

	.align	3
.crc_fold:
	/* v_tmp3 = { low 64 bits: 0, high 64 bits: crc << 48 } */
	fmov	d_tmp1, x_crc
	fmov	d_tmp2, xzr
	dup	d_tmp3, v_tmp2.d[0]
	shl	d_tmp1, d_tmp1, 48
	ins	v_tmp3.d[1], v_tmp1.d[0]

	/*
	 * x_counter = bytes left for the fold loop after the first 64-byte
	 * block.  The flags set by this cmp are consumed by the bls below;
	 * none of the instructions in between modify them.
	 */
	and	x_counter, x_len, -64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63
	add	x_buf_saved, x_buf, 64		// x0 is free now: crc lives in v_tmp3

	ldp	q_x0, q_x1, [x_buf]
	ldp	q_x2, q_x3, [x_buf, 32]

#ifndef __APPLE__
	adrp	x_tmp, .shuffle_mask_lanchor
	ldr	q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
#else
	adrp	x_tmp, .shuffle_mask_lanchor@PAGE
	ldr	q7, [x_tmp, .shuffle_mask_lanchor@PAGEOFF]
#endif

	/* byte-reverse each 16-byte lane; mix the seed into the first one */
	tbl	v_tmp1.16b, {v_x0.16b}, v7.16b
	eor	v_x0.16b, v_tmp3.16b, v_tmp1.16b

	tbl	v_x1.16b, {v_x1.16b}, v7.16b
	tbl	v_x2.16b, {v_x2.16b}, v7.16b
	tbl	v_x3.16b, {v_x3.16b}, v7.16b
	bls	.crc_fold_loop_end		// fewer than 128 bytes of whole blocks

/* carry less multiplication, part2 - loop */
q_y0		.req	q28
q_y1		.req	q29
q_y2		.req	q30
q_y3		.req	q31

v_y0		.req	v28
v_y1		.req	v29
v_y2		.req	v30
v_y3		.req	v31

d_x0_h		.req	d24
d_x0_l		.req	d2
d_x1_h		.req	d25
d_x1_l		.req	d3
d_x2_h		.req	d26
d_x2_l		.req	d4
d_x3_h		.req	d27
d_x3_l		.req	d5

v_x0_h		.req	v24
v_x0_l		.req	v2
v_x1_h		.req	v25
v_x1_l		.req	v3
v_x2_h		.req	v26
v_x2_l		.req	v4
v_x3_h		.req	v27
v_x3_l		.req	v5

v_tmp1_x0	.req	v24
v_tmp1_x1	.req	v25
v_tmp1_x2	.req	v26
v_tmp1_x3	.req	v27

q_fold_const	.req	q17
v_fold_const	.req	v17

	ldr	q_fold_const, fold_constant

	.align	2
.crc_fold_loop:
	add	x_buf_saved, x_buf_saved, 64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63			// consumed by the bhi at loop end

	ldp	q_y0, q_y1, [x_buf_saved, -64]
	ldp	q_y2, q_y3, [x_buf_saved, -32]

	prfm	pldl2strm, [x_buf_saved, #1024]
	prfm	pldl2strm, [x_buf_saved, #1088]

	/* multiply high and low halves of each accumulator by the fold constants */
	pmull2	v_tmp1_x0.1q, v_x0.2d, v_fold_const.2d
	pmull	v_x0.1q, v_x0.1d, v_fold_const.1d

	pmull2	v_tmp1_x1.1q, v_x1.2d, v_fold_const.2d
	pmull	v_x1.1q, v_x1.1d, v_fold_const.1d

	pmull2	v_tmp1_x2.1q, v_x2.2d, v_fold_const.2d
	pmull	v_x2.1q, v_x2.1d, v_fold_const.1d

	pmull2	v_tmp1_x3.1q, v_x3.2d, v_fold_const.2d
	pmull	v_x3.1q, v_x3.1d, v_fold_const.1d

	/* xor both products with the byte-reversed new data */
	tbl	v_y0.16b, {v_y0.16b}, v_permutation.16b
	eor	v_x0.16b, v_tmp1_x0.16b, v_x0.16b
	eor	v_x0.16b, v_x0.16b, v_y0.16b

	tbl	v_y1.16b, {v_y1.16b}, v_permutation.16b
	eor	v_x1.16b, v_tmp1_x1.16b, v_x1.16b
	eor	v_x1.16b, v_x1.16b, v_y1.16b

	tbl	v_y2.16b, {v_y2.16b}, v_permutation.16b
	eor	v_x2.16b, v_tmp1_x2.16b, v_x2.16b
	eor	v_x2.16b, v_x2.16b, v_y2.16b

	tbl	v_y3.16b, {v_y3.16b}, v_permutation.16b
	eor	v_x3.16b, v_tmp1_x3.16b, v_x3.16b
	eor	v_x3.16b, v_x3.16b, v_y3.16b

	bhi	.crc_fold_loop

/* carry less multiplication, part3 - after loop */
/* folding 512bit ---> 128bit */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v16, v30 are tmp registers

.crc_fold_loop_end:
	mov	x_tmp, 0x4c1a0000	/* p1 [1] */
	fmov	d0, x_tmp
	mov	x_tmp, 0xfb0b0000	/* p1 [0] */
	fmov	d1, x_tmp

	/*
	 * Bytes consumed by the fold path = len rounded down to a multiple
	 * of 64.  Computed in 64 bits (len is uint64_t) and left in both
	 * x_counter and x_tmp, which is what .crc_table_loop_pre expects.
	 * Neither register is written again before the branch back.
	 */
	and	x_counter, x_len, -64
	mov	x_tmp, x_counter
	add	x_buf, x_buf, x_tmp		// x_buf -> first tail byte

	/* fold x0 into x1 */
	dup	d6, v_x0.d[1]
	dup	d30, v_x0.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v30.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v30.16b
	eor	v_x1.16b, v6.16b, v_x1.16b

	/* fold x1 into x2 */
	dup	d6, v_x1.d[1]
	dup	d30, v_x1.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v16.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v16.16b
	eor	v_x2.16b, v6.16b, v_x2.16b

	/* fold x2 into x3; the 128-bit result is left in v_x0 */
	dup	d_x0, v_x2.d[1]
	dup	d30, v_x2.d[0]
	pmull	v0.1q, v_x0.1d, v0.1d
	pmull	v_x0.1q, v30.1d, v1.1d
	eor	v1.16b, v0.16b, v_x0.16b
	eor	v_x0.16b, v1.16b, v_x3.16b

/* carry less multiplication, part3 - after loop */
/* crc16 fold function */
d_16fold_p0_h		.req	d18
v_16fold_p0_h		.req	v18

d_16fold_p0_l		.req	d4
v_16fold_p0_l		.req	v4

v_16fold_from		.req	v_x0
d_16fold_from_h		.req	d3
v_16fold_from_h		.req	v3

v_16fold_zero		.req	v7

v_16fold_from1		.req	v16

v_16fold_from2		.req	v0
d_16fold_from2_h	.req	d6
v_16fold_from2_h	.req	v6

v_16fold_tmp		.req	v0

	movi	v_16fold_zero.4s, 0		// v7 no longer needed as shuffle mask
	mov	x_tmp1, 0x2d560000	/* p0 [1] */
	mov	x_tmp2, 0x13680000	/* p0 [0] */

	ext	v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
	ext	v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

	dup	d_16fold_from_h, v_16fold_from.d[1]
	fmov	d_16fold_p0_h, x_tmp1
	pmull	v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
	eor	v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

	dup	d_16fold_from2_h, v_16fold_from2.d[1]
	fmov	d_16fold_p0_l, x_tmp2
	pmull	v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
	eor	v_x0.16b, v0.16b, v6.16b

/* carry less multiplication, part3 - after loop */
/* crc16 barrett reduction function */

// input parameters:
// v_x0:			v2
// barrett reduction constant:	br[0], br[1]

d_br0	.req	d3
v_br0	.req	v3
d_br1	.req	d5
v_br1	.req	v5

	mov	x_tmp1, 0x57f9		/* br[0] low */
	movk	x_tmp1, 0xf65a, lsl 16	/* br[0] high */
	movk	x_tmp1, 0x1, lsl 32
	fmov	d_br0, x_tmp1

	dup	d1, v_x0.d[0]
	dup	d1, v1.d[0]
	ext	v1.16b, v1.16b, v7.16b, #4	// v7 is all-zero here
	pmull	v4.1q, v1.1d, v_br0.1d

	ext	v1.16b, v4.16b, v7.16b, #4
	mov	x_tmp1, 0x8bb70000	/* br[1] low */
	movk	x_tmp1, 0x1, lsl 32	/* br[1] high */

	fmov	d_br1, x_tmp1
	pmull	v_br1.1q, v1.1d, v_br1.1d
	eor	v_x0.16b, v_x0.16b, v_br1.16b

	/* crc = bits [31:16] of the reduced value; finish the tail bytes */
	umov	x0, v_x0.d[0]
	ubfx	x0, x0, 16, 16
	b	.crc_table_loop_pre

#ifndef __APPLE__
	.size	crc16_t10dif_pmull, .-crc16_t10dif_pmull
#endif

/* fold constants, loaded PC-relative by "ldr q_fold_const, fold_constant" */
	.align	4
fold_constant:
	.word 0x87e70000
	.word 0x00000000
	.word 0x371d0000
	.word 0x00000000

ASM_DEF_RODATA
	/*
	 * The mask is fetched with "ldr q7, [x, :lo12:...]", whose immediate
	 * offset is scaled by 16, so make the 16-byte alignment explicit.
	 * NOTE(review): ASM_DEF_RODATA is defined in aarch64_label.h, which
	 * is not visible here; confirm it does not already align.
	 */
	.align	4
.shuffle_mask_lanchor = . + 0
#ifndef __APPLE__
	.type	shuffle_mask, %object
	.size	shuffle_mask, 16
#endif
/* tbl index vector that reverses the byte order of a 16-byte register */
shuffle_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7,   6,  5,  4,  3,  2, 1, 0

	.align	4
.LANCHOR0 = . + 0
#ifndef __APPLE__
	.type	crc16tab, %object
	.size	crc16tab, 512
#endif
/* 256-entry halfword table indexed by (byte ^ (crc >> 8)) in .crc_table_loop */
crc16tab:
	.hword  0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
	.hword  0xEFBD, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
	.hword  0x54CD, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
	.hword  0xBB70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
	.hword  0xA99A, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
	.hword  0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
	.hword  0xFD57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
	.hword  0x12EA, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
	.hword  0xD883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
	.hword  0x373E, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
	.hword  0x8C4E, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
	.hword  0x63F3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
	.hword  0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
	.hword  0x9EA4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
	.hword  0x25D4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
	.hword  0xCA69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
	.hword  0x3AB1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
	.hword  0xD50C, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
	.hword  0x6E7C, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
	.hword  0x81C1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
	.hword  0x932B, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
	.hword  0x7C96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
	.hword  0xC7E6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
	.hword  0x285B, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
	.hword  0xE232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
	.hword  0x0D8F, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
	.hword  0xB6FF, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
	.hword  0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
	.hword  0x4BA8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
	.hword  0xA415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
	.hword  0x1F65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
	.hword  0xF0D8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3