1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) ARM Limited. 2021 All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are 8 * met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * * Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * * Neither the name of Intel Corporation nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 21 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #ifndef __aarch64__ 34 #error Unsupported hardware 35 #endif 36 37 #include "spdk/stdinc.h" 38 #include <arm_sve.h> 39 40 static int 41 table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3, 42 svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7, 43 svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl) 44 { 45 svuint8_t res2, res3, res4, res5, res6, res7; 46 47 /* 48 * In base64 decode table, the first 32 elements are invalid value, 49 * so skip tbl_vec0 and tbl_vec1 50 */ 51 indices = svsub_n_u8_z(p8_in, indices, 2 * vl); 52 res2 = svtbl_u8(tbl_vec2, indices); 53 indices = svsub_n_u8_z(p8_in, indices, vl); 54 res3 = svtbl_u8(tbl_vec3, indices); 55 indices = svsub_n_u8_z(p8_in, indices, vl); 56 res4 = svtbl_u8(tbl_vec4, indices); 57 indices = svsub_n_u8_z(p8_in, indices, vl); 58 res5 = svtbl_u8(tbl_vec5, indices); 59 indices = svsub_n_u8_z(p8_in, indices, vl); 60 res6 = svtbl_u8(tbl_vec6, indices); 61 indices = svsub_n_u8_z(p8_in, indices, vl); 62 res7 = svtbl_u8(tbl_vec7, indices); 63 64 *output = svdup_n_u8(0); 65 *output = svadd_u8_z(p8_in, res2, *output); 66 *output = svadd_u8_z(p8_in, res3, *output); 67 *output = svadd_u8_z(p8_in, res4, *output); 68 *output = svadd_u8_z(p8_in, res5, *output); 69 *output = svadd_u8_z(p8_in, res6, *output); 70 *output = svadd_u8_z(p8_in, res7, *output); 71 72 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 73 return -1; 74 } 75 76 return 0; 77 } 78 79 static int 80 table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3, 81 svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl) 82 { 83 svuint8_t res0, res1, res2, res3; 84 85 res0 = svtbl_u8(tbl_vec0, indices); 86 indices = svsub_n_u8_z(p8_in, indices, vl); 87 res1 = svtbl_u8(tbl_vec1, indices); 88 indices = svsub_n_u8_z(p8_in, indices, vl); 89 res2 = svtbl_u8(tbl_vec2, indices); 90 indices = svsub_n_u8_z(p8_in, indices, vl); 91 res3 = svtbl_u8(tbl_vec3, indices); 92 93 *output = svdup_n_u8(0); 94 95 *output = svadd_u8_z(p8_in, res0, *output); 96 *output = svadd_u8_z(p8_in, res1, *output); 97 *output = svadd_u8_z(p8_in, res2, *output); 98 *output = svadd_u8_z(p8_in, res3, *output); 99 100 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 101 return -1; 102 } 103 104 return 0; 105 } 106 107 static int 108 table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices, 109 svuint8_t *output, svbool_t p8_in, uint64_t vl) 110 { 111 svuint8_t res0, res1, res2; 112 113 res0 = svtbl_u8(tbl_vec0, indices); 114 indices = svsub_n_u8_z(p8_in, indices, vl); 115 res1 = svtbl_u8(tbl_vec1, indices); 116 indices = svsub_n_u8_z(p8_in, indices, vl); 117 res2 = svtbl_u8(tbl_vec2, indices); 118 119 *output = svdup_n_u8(0); 120 121 *output = svadd_u8_z(p8_in, res0, *output); 122 *output = svadd_u8_z(p8_in, res1, *output); 123 *output = svadd_u8_z(p8_in, res2, *output); 124 125 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 126 return -1; 127 } 128 129 return 0; 130 } 131 132 static int 133 table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output, 134 svbool_t p8_in, uint64_t vl) 135 { 136 svuint8_t res0, res1; 137 138 res0 = svtbl_u8(tbl_vec0, indices); 139 indices = svsub_n_u8_z(p8_in, indices, vl); 140 res1 = svtbl_u8(tbl_vec1, indices); 141 142 *output = svdup_n_u8(0); 143 144 *output = svadd_u8_z(p8_in, res0, *output); 145 *output = svadd_u8_z(p8_in, res1, *output); 146 147 if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) { 148 return -1; 149 } 150 151 return 0; 152 } 153 154 static inline void 155 convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1, 156 svuint8_t *temp2, svuint8_t *temp3) 157 { 158 svuint8_t str0, str1, str2; 159 svuint8x3_t ld_enc_input; 160 161 ld_enc_input = svld3_u8(pred, src); 162 163 str0 = svget3_u8(ld_enc_input, 0); 164 str1 = svget3_u8(ld_enc_input, 1); 165 str2 = svget3_u8(ld_enc_input, 2); 166 167 168 *temp0 = svlsr_n_u8_z(pred, str0, 2); 169 *temp1 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str1, 4), svlsl_n_u8_z(pred, str0, 170 4)), 171 svdup_u8(0x3F)); 172 *temp2 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str2, 6), svlsl_n_u8_z(pred, str1, 173 2)), 174 svdup_u8(0x3F)); 175 *temp3 = svand_u8_z(pred, str2, svdup_u8(0x3F)); 176 } 177 178 static inline void 179 convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2, 180 svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2) 181 { 182 *output0 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp0, 2), svlsr_n_u8_z(pred, temp1, 4)); 183 *output1 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp1, 4), svlsr_n_u8_z(pred, temp2, 2)); 184 *output2 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp2, 6), temp3); 185 } 186 187 static void 188 base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len) 189 { 190 uint64_t vl = svcntb(); 191 svuint8_t temp0, temp1, temp2, temp3; 192 svuint8_t output0, output1, output2, output3; 193 svuint8_t tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3; 194 svuint8x4_t st_enc_output; 195 svbool_t p8_all = svptrue_b8(); 196 svbool_t pred; 197 uint64_t i = 0; 198 uint64_t pred_count = 0; 199 uint64_t N = (*src_len / 3) * 3; 200 201 if (vl == 16) { 202 203 tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0); 204 tbl_enc1 = svld1_u8(p8_all, (uint8_t *)enc_table + 16); 205 tbl_enc2 = svld1_u8(p8_all, (uint8_t *)enc_table + 32); 206 tbl_enc3 = svld1_u8(p8_all, (uint8_t *)enc_table + 48); 207 208 while (i < N) { 209 pred = svwhilelt_b8(i / 3, N / 3); 210 211 convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3); 212 213 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp0, &output0, pred, vl); 214 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp1, &output1, pred, vl); 215 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp2, &output2, pred, vl); 216 table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp3, &output3, pred, vl); 217 218 st_enc_output = svcreate4_u8(output0, output1, output2, output3); 219 svst4_u8(pred, (uint8_t *)*dst, st_enc_output); 220 221 pred_count = svcntp_b8(pred, pred); 222 *src += pred_count * 3; 223 *dst += pred_count * 4; 224 *src_len -= pred_count * 3; 225 i += pred_count * 3; 226 227 } 228 } else if (vl == 32 || vl == 48) { 229 230 tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0); 231 pred = svwhilelt_b8(vl, (uint64_t)64); 232 tbl_enc1 = svld1_u8(pred, (uint8_t *)enc_table + vl); 233 234 while (i < N) { 235 pred = svwhilelt_b8(i / 3, N / 3); 236 237 convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3); 238 239 table_lookup_2vec(tbl_enc0, tbl_enc1, temp0, &output0, pred, vl); 240 table_lookup_2vec(tbl_enc0, tbl_enc1, temp1, &output1, pred, vl); 241 table_lookup_2vec(tbl_enc0, tbl_enc1, temp2, &output2, pred, vl); 242 table_lookup_2vec(tbl_enc0, tbl_enc1, temp3, &output3, pred, vl); 243 244 st_enc_output = svcreate4_u8(output0, output1, output2, output3); 245 svst4_u8(pred, (uint8_t *)*dst, st_enc_output); 246 247 pred_count = svcntp_b8(pred, pred); 248 *src += pred_count * 3; 249 *dst += pred_count * 4; 250 *src_len -= pred_count * 3; 251 i += pred_count * 3; 252 253 } 254 } else if (vl >= 64) { 255 256 pred = svwhilelt_b8((uint64_t)0, (uint64_t)64); 257 tbl_enc0 = svld1_u8(pred, (uint8_t *)enc_table); 258 259 while (i < N) { 260 pred = svwhilelt_b8(i / 3, N / 3); 261 262 convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3); 263 264 output0 = svtbl_u8(tbl_enc0, temp0); 265 output1 = svtbl_u8(tbl_enc0, temp1); 266 output2 = svtbl_u8(tbl_enc0, temp2); 267 output3 = svtbl_u8(tbl_enc0, temp3); 268 269 st_enc_output = svcreate4_u8(output0, output1, output2, output3); 270 svst4_u8(pred, (uint8_t *)*dst, st_enc_output); 271 272 pred_count = svcntp_b8(pred, pred); 273 *src += pred_count * 3; 274 *dst += pred_count * 4; 275 *src_len -= pred_count * 3; 276 i += pred_count * 3; 277 278 } 279 } 280 } 281 282 static void 283 base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len) 284 { 285 uint64_t vl = svcntb(); 286 svuint8_t str0, str1, str2, str3; 287 svuint8_t temp0, temp1, temp2, temp3; 288 svuint8_t output0, output1, output2; 289 svuint8_t tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, tbl_dec7; 290 svuint8x3_t st_dec_output; 291 svbool_t p8_all = svptrue_b8(); 292 svbool_t pred; 293 uint64_t i = 0; 294 uint64_t pred_count = 0; 295 uint64_t N = (*src_len / 4) * 4; 296 svuint8x4_t ld_dec_input; 297 298 if (vl == 16) { 299 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 300 tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + 16); 301 tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + 32); 302 tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + 48); 303 tbl_dec4 = svld1_u8(p8_all, (uint8_t *)dec_table + 64); 304 tbl_dec5 = svld1_u8(p8_all, (uint8_t *)dec_table + 80); 305 tbl_dec6 = svld1_u8(p8_all, (uint8_t *)dec_table + 96); 306 tbl_dec7 = svld1_u8(p8_all, (uint8_t *)dec_table + 112); 307 308 while (i < N) { 309 pred = svwhilelt_b8(i / 4, N / 4); 310 311 ld_dec_input = svld4_u8(pred, *src); 312 313 str0 = svget4_u8(ld_dec_input, 0); 314 str1 = svget4_u8(ld_dec_input, 1); 315 str2 = svget4_u8(ld_dec_input, 2); 316 str3 = svget4_u8(ld_dec_input, 3); 317 318 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 319 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 320 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 321 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 322 323 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 324 tbl_dec7, str0, &temp0, pred, vl)) { return; } 325 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 326 tbl_dec7, str1, &temp1, pred, vl)) { return; } 327 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 328 tbl_dec7, str2, &temp2, pred, vl)) { return; } 329 if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, 330 tbl_dec7, str3, &temp3, pred, vl)) { return; } 331 332 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 333 334 st_dec_output = svcreate3_u8(output0, output1, output2); 335 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 336 337 pred_count = svcntp_b8(pred, pred); 338 *src += pred_count * 4; 339 *dst += pred_count * 3; 340 *src_len -= pred_count * 4; 341 i += pred_count * 4; 342 343 } 344 } else if (vl == 32) { 345 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 346 tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl); 347 tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 2); 348 tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 3); 349 350 while (i < N) { 351 pred = svwhilelt_b8(i / 4, N / 4); 352 353 ld_dec_input = svld4_u8(pred, *src); 354 355 str0 = svget4_u8(ld_dec_input, 0); 356 str1 = svget4_u8(ld_dec_input, 1); 357 str2 = svget4_u8(ld_dec_input, 2); 358 str3 = svget4_u8(ld_dec_input, 3); 359 360 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 361 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 362 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 363 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 364 365 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str0, &temp0, pred, vl)) { return; } 366 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str1, &temp1, pred, vl)) { return; } 367 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str2, &temp2, pred, vl)) { return; } 368 if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str3, &temp3, pred, vl)) { return; } 369 370 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 371 372 st_dec_output = svcreate3_u8(output0, output1, output2); 373 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 374 375 pred_count = svcntp_b8(pred, pred); 376 *src += pred_count * 4; 377 *dst += pred_count * 3; 378 *src_len -= pred_count * 4; 379 i += pred_count * 4; 380 381 } 382 383 } else if (vl == 48) { 384 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 385 tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl); 386 pred = svwhilelt_b8(vl * 2, (uint64_t)128); 387 tbl_dec2 = svld1_u8(pred, (uint8_t *)dec_table + 2 * vl); 388 389 while (i < N) { 390 pred = svwhilelt_b8(i / 4, N / 4); 391 392 ld_dec_input = svld4_u8(pred, *src); 393 394 str0 = svget4_u8(ld_dec_input, 0); 395 str1 = svget4_u8(ld_dec_input, 1); 396 str2 = svget4_u8(ld_dec_input, 2); 397 str3 = svget4_u8(ld_dec_input, 3); 398 399 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 400 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 401 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 402 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 403 404 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str0, &temp0, pred, vl)) { return; } 405 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str1, &temp1, pred, vl)) { return; } 406 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str2, &temp2, pred, vl)) { return; } 407 if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str3, &temp3, pred, vl)) { return; } 408 409 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 410 411 st_dec_output = svcreate3_u8(output0, output1, output2); 412 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 413 414 pred_count = svcntp_b8(pred, pred); 415 *src += pred_count * 4; 416 *dst += pred_count * 3; 417 *src_len -= pred_count * 4; 418 i += pred_count * 4; 419 420 } 421 } else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) { 422 tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0); 423 pred = svwhilelt_b8(vl, (uint64_t)128); 424 tbl_dec1 = svld1_u8(pred, (uint8_t *)dec_table + vl); 425 426 while (i < N) { 427 pred = svwhilelt_b8(i / 4, N / 4); 428 429 ld_dec_input = svld4_u8(pred, *src); 430 431 str0 = svget4_u8(ld_dec_input, 0); 432 str1 = svget4_u8(ld_dec_input, 1); 433 str2 = svget4_u8(ld_dec_input, 2); 434 str3 = svget4_u8(ld_dec_input, 3); 435 436 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 437 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 438 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 439 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 440 441 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str0, &temp0, pred, vl)) { return; } 442 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str1, &temp1, pred, vl)) { return; } 443 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str2, &temp2, pred, vl)) { return; } 444 if (table_lookup_2vec(tbl_dec0, tbl_dec1, str3, &temp3, pred, vl)) { return; } 445 446 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 447 448 st_dec_output = svcreate3_u8(output0, output1, output2); 449 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 450 451 pred_count = svcntp_b8(pred, pred); 452 *src += pred_count * 4; 453 *dst += pred_count * 3; 454 *src_len -= pred_count * 4; 455 i += pred_count * 4; 456 457 } 458 } else if (vl >= 128) { 459 pred = svwhilelt_b8((uint64_t)0, (uint64_t)128); 460 tbl_dec0 = svld1_u8(pred, (uint8_t *)dec_table + 0); 461 462 while (i < N) { 463 pred = svwhilelt_b8(i / 4, N / 4); 464 465 ld_dec_input = svld4_u8(pred, *src); 466 467 str0 = svget4_u8(ld_dec_input, 0); 468 str1 = svget4_u8(ld_dec_input, 1); 469 str2 = svget4_u8(ld_dec_input, 2); 470 str3 = svget4_u8(ld_dec_input, 3); 471 472 if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; } 473 if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; } 474 if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; } 475 if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; } 476 477 temp0 = svtbl_u8(tbl_dec0, str0); 478 temp1 = svtbl_u8(tbl_dec0, str1); 479 temp2 = svtbl_u8(tbl_dec0, str2); 480 temp3 = svtbl_u8(tbl_dec0, str3); 481 482 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp0, 255))) { return; } 483 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp1, 255))) { return; } 484 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp2, 255))) { return; } 485 if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp3, 255))) { return; } 486 487 convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2); 488 489 st_dec_output = svcreate3_u8(output0, output1, output2); 490 svst3_u8(pred, (uint8_t *)*dst, st_dec_output); 491 492 pred_count = svcntp_b8(pred, pred); 493 *src += pred_count * 4; 494 *dst += pred_count * 3; 495 *src_len -= pred_count * 4; 496 i += pred_count * 4; 497 498 } 499 } 500 } 501