/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
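	/* after the vector loop, i equals nb_iterations, so i * vlen is the index
	 * of the first element that still needs a scalar conversion
	 */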
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* store 4 elements */
	vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_s32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int32_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
{
	*output = scale * vcvts_f32_s32(*input);
}

int
rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int32_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint32_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
{
	*output = scale * vcvts_f32_u32(*input);
}

int
rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint32_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int64_neon_s64x2(float scale, float *input, int64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t (Neon has no direct float32 to int64 conversion) */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* store 2 elements */
	vst1q_s64(output, s64x2);
}

static inline void
__float32_to_int64_neon_s64x1(float scale, float *input, int64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;

	/* load 1 x float element, duplicated to both lanes */
	f32x2 = vdup_n_f32(*input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* store lane 0 of int64x2_t */
	vst1q_lane_s64(output, s64x2, 0);
}

int
rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int64_neon_s64x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int64_neon_s64x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x int64_t elements */
	s64x2 = vld1q_s64(input);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__int64_to_float32_neon_f32x1(float scale, int64_t *input, float *output)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x int64_t element into lane 0 */
	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store lane 0 of float32x2_t */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store 2 elements */
	vst1q_u64(output, u64x2);
}

static inline void
__float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 1 x float element */
	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store lane 0 of uint64x2_t */
	vst1q_lane_u64(output, u64x2, 0);
}

int
rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint64_neon_u64x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint64_neon_u64x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x uint64_t elements */
	u64x2 = vld1q_u64(input);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__uint64_to_float32_neon_f32x1(float scale, uint64_t *input, float *output)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x uint64_t element into lane 0 */
	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store lane 0 of float32x2_t */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
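
/*
 * Illustrative usage sketch, not part of the library build: quantize a small
 * float32 buffer to int8 with the routines above and convert it back. The
 * function name, the MLDEV_UTILS_NEON_SELFTEST guard and the sample values are
 * hypothetical; the calling convention follows the code in this file
 * (float32 to integer: output = round(input * scale),
 * integer to float32: output = input * scale).
 */
#ifdef MLDEV_UTILS_NEON_SELFTEST
static int
__mldev_utils_neon_usage_example(void)
{
	float src[10] = {-1.0f, -0.5f, -0.1f, 0.0f, 0.1f, 0.25f, 0.5f, 0.75f, 0.9f, 1.0f};
	int8_t quantized[10];
	float dequantized[10];
	float scale = 127.0f; /* map [-1.0, 1.0] onto the int8_t range */
	int ret;

	/* 8 elements go through the Neon path, the remaining 2 through the scalar path */
	ret = rte_ml_io_float32_to_int8(scale, 10, src, quantized);
	if (ret != 0)
		return ret;

	return rte_ml_io_int8_to_float32(1.0f / scale, 10, quantized, dequantized);
}
#endif /* MLDEV_UTILS_NEON_SELFTEST */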