1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) ARM Limited. 2021 All rights reserved. 3 */ 4 5 #ifndef __aarch64__ 6 #error Unsupported hardware 7 #endif 8 9 #include "spdk/stdinc.h" 10 #include <arm_sve.h> 11 12 static int 13 table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3, 14 svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7, 15 svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl) 16 { 17 svuint8_t res2, res3, res4, res5, res6, res7; 18 19 /* 20 * In base64 decode table, the first 32 elements are invalid value, 21 * so skip tbl_vec0 and tbl_vec1 22 */ 23 indices = svsub_n_u8_z(p8_in, indices, 2 * vl); 24 res2 = svtbl_u8(tbl_vec2, indices); 25 indices = svsub_n_u8_z(p8_in, indices, vl); 26 res3 = svtbl_u8(tbl_vec3, indices); 27 indices = svsub_n_u8_z(p8_in, indices, vl); 28 res4 = svtbl_u8(tbl_vec4, indices); 29 indices = svsub_n_u8_z(p8_in, indices, vl); 30 res5 = svtbl_u8(tbl_vec5, indices); 31 indices = svsub_n_u8_z(p8_in, indices, vl); 32 res6 = svtbl_u8(tbl_vec6, indices); 33 indices = svsub_n_u8_z(p8_in, indices, vl); 34 res7 = svtbl_u8(tbl_vec7, indices); 35 36 *output = svdup_n_u8(0); 37 *output = svadd_u8_z(p8_in, res2, *output); 38 *output = svadd_u8_z(p8_in, res3, *output); 39 *output = svadd_u8_z(p8_in, res4, *output); 40 *output = svadd_u8_z(p8_in, res5, *output); 41 *output = svadd_u8_z(p8_in, res6, *output); 42 *output = svadd_u8_z(p8_in, res7, *output); 43 44 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 45 return -1; 46 } 47 48 return 0; 49 } 50 51 static int 52 table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3, 53 svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl) 54 { 55 svuint8_t res0, res1, res2, res3; 56 57 res0 = svtbl_u8(tbl_vec0, indices); 58 indices = svsub_n_u8_z(p8_in, indices, vl); 59 res1 = svtbl_u8(tbl_vec1, indices); 60 indices = svsub_n_u8_z(p8_in, indices, vl); 61 res2 = svtbl_u8(tbl_vec2, indices); 62 indices = svsub_n_u8_z(p8_in, indices, vl); 63 res3 = svtbl_u8(tbl_vec3, indices); 64 65 *output = svdup_n_u8(0); 66 67 *output = svadd_u8_z(p8_in, res0, *output); 68 *output = svadd_u8_z(p8_in, res1, *output); 69 *output = svadd_u8_z(p8_in, res2, *output); 70 *output = svadd_u8_z(p8_in, res3, *output); 71 72 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 73 return -1; 74 } 75 76 return 0; 77 } 78 79 static int 80 table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices, 81 svuint8_t *output, svbool_t p8_in, uint64_t vl) 82 { 83 svuint8_t res0, res1, res2; 84 85 res0 = svtbl_u8(tbl_vec0, indices); 86 indices = svsub_n_u8_z(p8_in, indices, vl); 87 res1 = svtbl_u8(tbl_vec1, indices); 88 indices = svsub_n_u8_z(p8_in, indices, vl); 89 res2 = svtbl_u8(tbl_vec2, indices); 90 91 *output = svdup_n_u8(0); 92 93 *output = svadd_u8_z(p8_in, res0, *output); 94 *output = svadd_u8_z(p8_in, res1, *output); 95 *output = svadd_u8_z(p8_in, res2, *output); 96 97 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 98 return -1; 99 } 100 101 return 0; 102 } 103 104 static int 105 table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output, 106 svbool_t p8_in, uint64_t vl) 107 { 108 svuint8_t res0, res1; 109 110 res0 = svtbl_u8(tbl_vec0, indices); 111 indices = svsub_n_u8_z(p8_in, indices, vl); 112 res1 = svtbl_u8(tbl_vec1, indices); 113 114 *output = svdup_n_u8(0); 115 116 *output = svadd_u8_z(p8_in, res0, *output); 117 *output = svadd_u8_z(p8_in, res1, *output); 118 119 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 120 return -1; 121 } 122 123 return 0; 124 } 125 126 static inline void 127 convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1, 128 svuint8_t *temp2, svuint8_t *temp3) 129 { 130 svuint8_t str0, str1, str2; 131 svuint8x3_t ld_enc_input; 132 133 ld_enc_input = svld3_u8(pred, src); 134 135 str0 = svget3_u8(ld_enc_input, 0); 136 str1 = svget3_u8(ld_enc_input, 1); 137 str2 = svget3_u8(ld_enc_input, 2); 138 139 140 *temp0 = svlsr_n_u8_z(pred, str0, 2); 141 *temp1 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str1, 4), svlsl_n_u8_z(pred, str0, 142 4)), 143 svdup_u8(0x3F)); 144 *temp2 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str2, 6), svlsl_n_u8_z(pred, str1, 145 2)), 146 svdup_u8(0x3F)); 147 *temp3 = svand_u8_z(pred, str2, svdup_u8(0x3F)); 148 } 149 150 static inline void 151 convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2, 152 svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2) 153 { 154 *output0 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp0, 2), svlsr_n_u8_z(pred, temp1, 4)); 155 *output1 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp1, 4), svlsr_n_u8_z(pred, temp2, 2)); 156 *output2 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp2, 6), temp3); 157 } 158 159 static void 160 base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len) 161 { 162 uint64_t vl = svcntb(); 163 svuint8_t temp0, temp1, temp2, temp3; 164 svuint8_t output0, output1, output2, output3; 165 svuint8_t tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3; 166 svuint8x4_t st_enc_output; 167 svbool_t p8_all = svptrue_b8(); 168 svbool_t pred; 169 uint64_t i = 0; 170 uint64_t pred_count = 0; 171 uint64_t N = (*src_len / 3) * 3; 172 173 if (vl == 16) { 174 175 tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0); 176 tbl_enc1 = svld1_u8(p8_all, (uint8_t *)enc_table + 16); 177 tbl_enc2 = svld1_u8(p8_all, (uint8_t *)enc_table + 32); 178 tbl_enc3 = svld1_u8(p8_all, (uint8_t *)enc_table + 48); 179 180 while (i < N) { 181 pred = svwhilelt_b8(i / 3, N / 3); 182 183 convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3); 184 185 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp0, &output0, pred, vl); 186 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp1, &output1, pred, vl); 187 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp2, &output2, pred, vl); 188 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp3, &output3, pred, vl); 189 190 st_enc_output = svcreate4_u8(output0, output1, output2, output3); 191 svst4_u8(pred, (uint8_t *)*dst, st_enc_output); 192 193 pred_count = svcntp_b8(pred, pred); 194 *src += pred_count * 3; 195 *dst += pred_count * 4; 196 *src_len -= pred_count * 3; 197 i += pred_count * 3; 198 199 } 200 } else if (vl == 32 || vl == 48) { 201 202 tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0); 203 pred = svwhilelt_b8(vl, (uint64_t)64); 204 tbl_enc1 = svld1_u8(pred, (uint8_t *)enc_table + vl); 205 206 while (i < N) { 207 pred = svwhilelt_b8(i / 3, N / 3); 208 209 convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3); 210 211 table_lookup_2vec(tbl_enc0, tbl_enc1, temp0, &output0, pred, vl); 212 table_lookup_2vec(tbl_enc0, tbl_enc1, temp1, &output1, pred, vl); 213 table_lookup_2vec(tbl_enc0, tbl_enc1, temp2, &output2, pred, vl); 214 table_lookup_2vec(tbl_enc0, tbl_enc1, temp3, &output3, pred, vl); 215 216 st_enc_output = svcreate4_u8(output0, output1, output2, output3); 217 svst4_u8(pred, (uint8_t *)*dst, st_enc_output); 218 219 pred_count = svcntp_b8(pred, pred); 220 *src += pred_count * 3; 221 *dst += pred_count * 4; 222 *src_len -= pred_count * 3; 223 i += pred_count * 3; 224 225 } 226 } else if (vl >= 64) { 227 228 pred = svwhilelt_b8((uint64_t)0, (uint64_t)64); 229 tbl_enc0 = svld1_u8(pred, (uint8_t *)enc_table); 230 231 while (i < N) { 232 pred = svwhilelt_b8(i / 3, N / 3); 233 234 convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3); 235 236 output0 = svtbl_u8(tbl_enc0, temp0); 237 output1 = svtbl_u8(tbl_enc0, temp1); 238 output2 = svtbl_u8(tbl_enc0, temp2); 239 output3 = svtbl_u8(tbl_enc0, temp3); 240 241 st_enc_output = svcreate4_u8(output0, output1, output2, output3); 242 svst4_u8(pred, (uint8_t *)*dst, st_enc_output); 243 244 pred_count = svcntp_b8(pred, pred); 245 *src += pred_count * 3; 246 *dst += pred_count * 4; 247 *src_len -= pred_count * 3; 248 i += pred_count * 3; 249 250 } 251 } 252 } 253 254 static void 255 base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len) 256 { 257 uint64_t vl = svcntb(); 258 svuint8_t str0, str1, str2, str3; 259 svuint8_t temp0, temp1, temp2, temp3; 260 svuint8_t output0, output1, output2; 261 svuint8_t tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, tbl_dec7; 262 svuint8x3_t st_dec_output; 263 svbool_t p8_all = svptrue_b8(); 264 svbool_t pred; 265 uint64_t i = 0; 266 uint64_t pred_count = 0; 267 uint64_t N = (*src_len / 4) * 4; 268 svuint8x4_t ld_dec_input; 269 270 if (vl == 16) { 271 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 272 tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + 16); 273 tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + 32); 274 tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + 48); 275 tbl_dec4 = svld1_u8(p8_all, (uint8_t *)dec_table + 64); 276 tbl_dec5 = svld1_u8(p8_all, (uint8_t *)dec_table + 80); 277 tbl_dec6 = svld1_u8(p8_all, (uint8_t *)dec_table + 96); 278 tbl_dec7 = svld1_u8(p8_all, (uint8_t *)dec_table + 112); 279 280 while (i < N) { 281 pred = svwhilelt_b8(i / 4, N / 4); 282 283 ld_dec_input = svld4_u8(pred, *src); 284 285 str0 = svget4_u8(ld_dec_input, 0); 286 str1 = svget4_u8(ld_dec_input, 1); 287 str2 = svget4_u8(ld_dec_input, 2); 288 str3 = svget4_u8(ld_dec_input, 3); 289 290 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 291 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 292 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 293 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 294 295 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 296 tbl_dec7, str0, &temp0, pred, vl)) { return; } 297 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 298 tbl_dec7, str1, &temp1, pred, vl)) { return; } 299 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 300 tbl_dec7, str2, &temp2, pred, vl)) { return; } 301 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 302 tbl_dec7, str3, &temp3, pred, vl)) { return; } 303 304 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 305 306 st_dec_output = svcreate3_u8(output0, output1, output2); 307 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 308 309 pred_count = svcntp_b8(pred, pred); 310 *src += pred_count * 4; 311 *dst += pred_count * 3; 312 *src_len -= pred_count * 4; 313 i += pred_count * 4; 314 315 } 316 } else if (vl == 32) { 317 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 318 tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl); 319 tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 2); 320 tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 3); 321 322 while (i < N) { 323 pred = svwhilelt_b8(i / 4, N / 4); 324 325 ld_dec_input = svld4_u8(pred, *src); 326 327 str0 = svget4_u8(ld_dec_input, 0); 328 str1 = svget4_u8(ld_dec_input, 1); 329 str2 = svget4_u8(ld_dec_input, 2); 330 str3 = svget4_u8(ld_dec_input, 3); 331 332 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 333 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 334 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 335 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 336 337 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str0, &temp0, pred, vl)) { return; } 338 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str1, &temp1, pred, vl)) { return; } 339 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str2, &temp2, pred, vl)) { return; } 340 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str3, &temp3, pred, vl)) { return; } 341 342 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 343 344 st_dec_output = svcreate3_u8(output0, output1, output2); 345 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 346 347 pred_count = svcntp_b8(pred, pred); 348 *src += pred_count * 4; 349 *dst += pred_count * 3; 350 *src_len -= pred_count * 4; 351 i += pred_count * 4; 352 353 } 354 355 } else if (vl == 48) { 356 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 357 tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl); 358 pred = svwhilelt_b8(vl * 2, (uint64_t)128); 359 tbl_dec2 = svld1_u8(pred, (uint8_t *)dec_table + 2 * vl); 360 361 while (i < N) { 362 pred = svwhilelt_b8(i / 4, N / 4); 363 364 ld_dec_input = svld4_u8(pred, *src); 365 366 str0 = svget4_u8(ld_dec_input, 0); 367 str1 = svget4_u8(ld_dec_input, 1); 368 str2 = svget4_u8(ld_dec_input, 2); 369 str3 = svget4_u8(ld_dec_input, 3); 370 371 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 372 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 373 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 374 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 375 376 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str0, &temp0, pred, vl)) { return; } 377 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str1, &temp1, pred, vl)) { return; } 378 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str2, &temp2, pred, vl)) { return; } 379 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str3, &temp3, pred, vl)) { return; } 380 381 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 382 383 st_dec_output = svcreate3_u8(output0, output1, output2); 384 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 385 386 pred_count = svcntp_b8(pred, pred); 387 *src += pred_count * 4; 388 *dst += pred_count * 3; 389 *src_len -= pred_count * 4; 390 i += pred_count * 4; 391 392 } 393 } else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) { 394 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 395 pred = svwhilelt_b8(vl, (uint64_t)128); 396 tbl_dec1 = svld1_u8(pred, (uint8_t *)dec_table + vl); 397 398 while (i < N) { 399 pred = svwhilelt_b8(i / 4, N / 4); 400 401 ld_dec_input = svld4_u8(pred, *src); 402 403 str0 = svget4_u8(ld_dec_input, 0); 404 str1 = svget4_u8(ld_dec_input, 1); 405 str2 = svget4_u8(ld_dec_input, 2); 406 str3 = svget4_u8(ld_dec_input, 3); 407 408 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 409 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 410 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 411 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 412 413 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str0, &temp0, pred, vl)) { return; } 414 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str1, &temp1, pred, vl)) { return; } 415 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str2, &temp2, pred, vl)) { return; } 416 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str3, &temp3, pred, vl)) { return; } 417 418 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 419 420 st_dec_output = svcreate3_u8(output0, output1, output2); 421 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 422 423 pred_count = svcntp_b8(pred, pred); 424 *src += pred_count * 4; 425 *dst += pred_count * 3; 426 *src_len -= pred_count * 4; 427 i += pred_count * 4; 428 429 } 430 } else if (vl >= 128) { 431 pred = svwhilelt_b8((uint64_t)0, (uint64_t)128); 432 tbl_dec0 = svld1_u8(pred, (uint8_t *)dec_table + 0); 433 434 while (i < N) { 435 pred = svwhilelt_b8(i / 4, N / 4); 436 437 ld_dec_input = svld4_u8(pred, *src); 438 439 str0 = svget4_u8(ld_dec_input, 0); 440 str1 = svget4_u8(ld_dec_input, 1); 441 str2 = svget4_u8(ld_dec_input, 2); 442 str3 = svget4_u8(ld_dec_input, 3); 443 444 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 445 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 446 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 447 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 448 449 temp0 = svtbl_u8(tbl_dec0, str0); 450 temp1 = svtbl_u8(tbl_dec0, str1); 451 temp2 = svtbl_u8(tbl_dec0, str2); 452 temp3 = svtbl_u8(tbl_dec0, str3); 453 454 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp0, 255))) { return; } 455 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp1, 255))) { return; } 456 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp2, 255))) { return; } 457 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp3, 255))) { return; } 458 459 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 460 461 st_dec_output = svcreate3_u8(output0, output1, output2); 462 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 463 464 pred_count = svcntp_b8(pred, pred); 465 *src += pred_count * 4; 466 *dst += pred_count * 3; 467 *src_len -= pred_count * 4; 468 i += pred_count * 4; 469 470 } 471 } 472 } 473