/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
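/* Example: quantizing float32 values to int8_t with rte_ml_io_float32_to_int8(). This is an
 * illustrative sketch only; the scale factor, buffer size and data values below are arbitrary
 * and chosen to show the rounding and saturation behaviour.
 *
 *	float in[10] = {0.5f, -0.5f, 1.0f, -1.0f, 0.0f, 0.25f, -0.25f, 0.75f, 2.0f, -2.0f};
 *	int8_t out[10];
 *	int ret;
 *
 *	// scale = 127 maps [-1.0, 1.0] onto [-127, 127]. 0.5 * 127 = 63.5 rounds to 64
 *	// (round to nearest, ties away from zero); 2.0 * 127 = 254 saturates to 127.
 *	ret = rte_ml_io_float32_to_int8(127.0f, 10, in, out);
 *	// ret == 0 on success; the first 8 elements go through the vector path and the
 *	// remaining 2 through the scalar leftover path.
 */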
static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
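/* Note on the unsigned conversions above: vcvtaq_u32_f32()/vcvtas_u32_f32() saturate at the
 * bounds of the unsigned range, so a negative scaled input produces 0 rather than wrapping.
 * Illustrative example (arbitrary values):
 *
 *	float in[2] = {-0.5f, 300.0f};
 *	uint8_t out[2];
 *
 *	rte_ml_io_float32_to_uint8(1.0f, 2, in, out);
 *	// out[0] == 0   (negative input clamps to 0)
 *	// out[1] == 255 (300 exceeds UINT8_MAX and saturates in vqmovnh_u16())
 */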
static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
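/* Driver loop arithmetic used by the rte_ml_io_* wrappers, worked through for the int16 case
 * above (illustrative numbers only): vlen = 2 * sizeof(float) / sizeof(int16_t) = 4, i.e. one
 * __float32_to_int16_neon_s16x4() call consumes 4 elements. For nb_elements = 10,
 * nb_iterations = 10 / 4 = 2 vector iterations cover elements 0..7, then i is rescaled to
 * 2 * 4 = 8 and the scalar loop converts the remaining elements 8 and 9.
 */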
static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
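/* Example: float32 <-> float16 round trip (illustrative values only). Unlike the integer
 * conversions above, the float16 helpers take no scale argument; precision is simply reduced
 * to what IEEE 754 binary16 can represent.
 *
 *	float32_t in[4] = {1.0f, 0.333333f, 65504.0f, 1.0e-8f};
 *	float16_t half[4];
 *	float32_t back[4];
 *
 *	rte_ml_io_float32_to_float16(4, in, half);
 *	rte_ml_io_float16_to_float32(4, half, back);
 *	// back[0] == 1.0f exactly; back[1] is 0.333333f rounded to the ~11 significand bits
 *	// of binary16; back[2] == 65504.0f (the largest finite binary16 value); back[3]
 *	// underflows to 0.0f, since 1.0e-8 is below the smallest binary16 subnormal (~6.0e-8).
 */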