1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2022 Marvell. 3 */ 4 5 #include "mldev_utils_scalar.h" 6 7 /* Description: 8 * This file implements scalar versions of Machine Learning utility functions used to convert data 9 * types from higher precision to lower precision and vice-versa, except bfloat16. 10 */ 11 12 int 13 rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale, 14 int8_t zero_point) 15 { 16 const float *input_buffer; 17 int8_t *output_buffer; 18 uint64_t i; 19 int i32; 20 21 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 22 return -EINVAL; 23 24 input_buffer = (const float *)input; 25 output_buffer = (int8_t *)output; 26 27 for (i = 0; i < nb_elements; i++) { 28 i32 = (int32_t)(round(*input_buffer / scale) + zero_point); 29 30 if (i32 < INT8_MIN) 31 i32 = INT8_MIN; 32 33 if (i32 > INT8_MAX) 34 i32 = INT8_MAX; 35 36 *output_buffer = (int8_t)i32; 37 38 input_buffer++; 39 output_buffer++; 40 } 41 42 return 0; 43 } 44 45 int 46 rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 47 int8_t zero_point) 48 { 49 const int8_t *input_buffer; 50 float *output_buffer; 51 uint64_t i; 52 53 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 54 return -EINVAL; 55 56 input_buffer = (const int8_t *)input; 57 output_buffer = (float *)output; 58 59 for (i = 0; i < nb_elements; i++) { 60 *output_buffer = scale * (float)(*input_buffer - zero_point); 61 62 input_buffer++; 63 output_buffer++; 64 } 65 66 return 0; 67 } 68 69 int 70 rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale, 71 uint8_t zero_point) 72 { 73 const float *input_buffer; 74 uint8_t *output_buffer; 75 int32_t i32; 76 uint64_t i; 77 78 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 79 return -EINVAL; 80 81 input_buffer = (const float *)input; 82 output_buffer = (uint8_t *)output; 83 84 for (i = 0; i < nb_elements; i++) { 85 i32 = (int32_t)(round(*input_buffer / scale) + zero_point); 86 87 if (i32 < 0) 88 i32 = 0; 89 90 if (i32 > UINT8_MAX) 91 i32 = UINT8_MAX; 92 93 *output_buffer = (uint8_t)i32; 94 95 input_buffer++; 96 output_buffer++; 97 } 98 99 return 0; 100 } 101 102 int 103 rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 104 uint8_t zero_point) 105 { 106 const uint8_t *input_buffer; 107 float *output_buffer; 108 uint64_t i; 109 110 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 111 return -EINVAL; 112 113 input_buffer = (const uint8_t *)input; 114 output_buffer = (float *)output; 115 116 for (i = 0; i < nb_elements; i++) { 117 *output_buffer = scale * (float)(*input_buffer - zero_point); 118 119 input_buffer++; 120 output_buffer++; 121 } 122 123 return 0; 124 } 125 126 int 127 rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale, 128 int16_t zero_point) 129 { 130 const float *input_buffer; 131 int16_t *output_buffer; 132 int32_t i32; 133 uint64_t i; 134 135 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 136 return -EINVAL; 137 138 input_buffer = (const float *)input; 139 output_buffer = (int16_t *)output; 140 141 for (i = 0; i < nb_elements; i++) { 142 i32 = (int32_t)(round(*input_buffer / scale) + zero_point); 143 144 if (i32 < INT16_MIN) 145 i32 = INT16_MIN; 146 147 if (i32 > INT16_MAX) 148 i32 = INT16_MAX; 149 150 *output_buffer = (int16_t)i32; 151 152 input_buffer++; 153 output_buffer++; 154 } 155 156 return 0; 157 } 158 159 int 160 rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 161 int16_t zero_point) 162 { 163 const int16_t *input_buffer; 164 float *output_buffer; 165 uint64_t i; 166 167 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 168 return -EINVAL; 169 170 input_buffer = (const int16_t *)input; 171 output_buffer = (float *)output; 172 173 for (i = 0; i < nb_elements; i++) { 174 *output_buffer = scale * (float)(*input_buffer - zero_point); 175 176 input_buffer++; 177 output_buffer++; 178 } 179 180 return 0; 181 } 182 183 int 184 rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale, 185 uint16_t zero_point) 186 { 187 const float *input_buffer; 188 uint16_t *output_buffer; 189 int32_t i32; 190 uint64_t i; 191 192 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 193 return -EINVAL; 194 195 input_buffer = (const float *)input; 196 output_buffer = (uint16_t *)output; 197 198 for (i = 0; i < nb_elements; i++) { 199 i32 = (int32_t)(round(*input_buffer / scale) + zero_point); 200 201 if (i32 < 0) 202 i32 = 0; 203 204 if (i32 > UINT16_MAX) 205 i32 = UINT16_MAX; 206 207 *output_buffer = (uint16_t)i32; 208 209 input_buffer++; 210 output_buffer++; 211 } 212 213 return 0; 214 } 215 216 int 217 rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 218 uint16_t zero_point) 219 { 220 const uint16_t *input_buffer; 221 float *output_buffer; 222 uint64_t i; 223 224 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 225 return -EINVAL; 226 227 input_buffer = (const uint16_t *)input; 228 output_buffer = (float *)output; 229 230 for (i = 0; i < nb_elements; i++) { 231 *output_buffer = scale * (float)(*input_buffer - zero_point); 232 233 input_buffer++; 234 output_buffer++; 235 } 236 237 return 0; 238 } 239 240 int 241 rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale, 242 int32_t zero_point) 243 { 244 const float *input_buffer; 245 int32_t *output_buffer; 246 uint64_t i; 247 248 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 249 return -EINVAL; 250 251 input_buffer = (const float *)input; 252 output_buffer = (int32_t *)output; 253 254 for (i = 0; i < nb_elements; i++) { 255 *output_buffer = (int32_t)(round(*input_buffer / scale) + zero_point); 256 257 input_buffer++; 258 output_buffer++; 259 } 260 261 return 0; 262 } 263 264 int 265 rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 266 int32_t zero_point) 267 { 268 const int32_t *input_buffer; 269 float *output_buffer; 270 uint64_t i; 271 272 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 273 return -EINVAL; 274 275 input_buffer = (const int32_t *)input; 276 output_buffer = (float *)output; 277 278 for (i = 0; i < nb_elements; i++) { 279 *output_buffer = scale * (float)(*input_buffer - zero_point); 280 281 input_buffer++; 282 output_buffer++; 283 } 284 285 return 0; 286 } 287 288 int 289 rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale, 290 uint32_t zero_point) 291 { 292 const float *input_buffer; 293 uint32_t *output_buffer; 294 int32_t i32; 295 uint64_t i; 296 297 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 298 return -EINVAL; 299 300 input_buffer = (const float *)input; 301 output_buffer = (uint32_t *)output; 302 303 for (i = 0; i < nb_elements; i++) { 304 i32 = (int32_t)(round(*input_buffer / scale) + zero_point); 305 306 if (i32 < 0) 307 i32 = 0; 308 309 *output_buffer = (uint32_t)i32; 310 311 input_buffer++; 312 output_buffer++; 313 } 314 315 return 0; 316 } 317 318 int 319 rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 320 uint32_t zero_point) 321 { 322 const uint32_t *input_buffer; 323 float *output_buffer; 324 uint64_t i; 325 326 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 327 return -EINVAL; 328 329 input_buffer = (const uint32_t *)input; 330 output_buffer = (float *)output; 331 332 for (i = 0; i < nb_elements; i++) { 333 *output_buffer = scale * (float)(*input_buffer - zero_point); 334 335 input_buffer++; 336 output_buffer++; 337 } 338 339 return 0; 340 } 341 342 int 343 rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale, 344 int64_t zero_point) 345 { 346 const float *input_buffer; 347 int64_t *output_buffer; 348 uint64_t i; 349 350 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 351 return -EINVAL; 352 353 input_buffer = (const float *)input; 354 output_buffer = (int64_t *)output; 355 356 for (i = 0; i < nb_elements; i++) { 357 *output_buffer = (int64_t)(round(*input_buffer / scale) + zero_point); 358 359 input_buffer++; 360 output_buffer++; 361 } 362 363 return 0; 364 } 365 366 int 367 rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 368 int64_t zero_point) 369 { 370 const int64_t *input_buffer; 371 float *output_buffer; 372 uint64_t i; 373 374 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 375 return -EINVAL; 376 377 input_buffer = (const int64_t *)input; 378 output_buffer = (float *)output; 379 380 for (i = 0; i < nb_elements; i++) { 381 *output_buffer = scale * (float)(*input_buffer - zero_point); 382 383 input_buffer++; 384 output_buffer++; 385 } 386 387 return 0; 388 } 389 390 int 391 rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale, 392 uint64_t zero_point) 393 { 394 const float *input_buffer; 395 uint64_t *output_buffer; 396 int64_t i64; 397 uint64_t i; 398 399 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 400 return -EINVAL; 401 402 input_buffer = (const float *)input; 403 output_buffer = (uint64_t *)output; 404 405 for (i = 0; i < nb_elements; i++) { 406 i64 = (int64_t)(round(*input_buffer / scale) + zero_point); 407 408 if (i64 < 0) 409 i64 = 0; 410 411 *output_buffer = (uint64_t)i64; 412 413 input_buffer++; 414 output_buffer++; 415 } 416 417 return 0; 418 } 419 420 int 421 rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale, 422 uint64_t zero_point) 423 { 424 const uint64_t *input_buffer; 425 float *output_buffer; 426 uint64_t i; 427 428 if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL)) 429 return -EINVAL; 430 431 input_buffer = (const uint64_t *)input; 432 output_buffer = (float *)output; 433 434 for (i = 0; i < nb_elements; i++) { 435 *output_buffer = scale * (float)(*input_buffer - zero_point); 436 437 input_buffer++; 438 output_buffer++; 439 } 440 441 return 0; 442 } 443 444 /* Convert a single precision floating point number (float32) into a half precision 445 * floating point number (float16) using round to nearest rounding mode. 446 */ 447 static uint16_t 448 __float32_to_float16_scalar_rtn(float x) 449 { 450 union float32 f32; /* float32 input */ 451 uint32_t f32_s; /* float32 sign */ 452 uint32_t f32_e; /* float32 exponent */ 453 uint32_t f32_m; /* float32 mantissa */ 454 uint16_t f16_s; /* float16 sign */ 455 uint16_t f16_e; /* float16 exponent */ 456 uint16_t f16_m; /* float16 mantissa */ 457 uint32_t tbits; /* number of truncated bits */ 458 uint32_t tmsb; /* MSB position of truncated bits */ 459 uint32_t m_32; /* temporary float32 mantissa */ 460 uint16_t m_16; /* temporary float16 mantissa */ 461 uint16_t u16; /* float16 output */ 462 int be_16; /* float16 biased exponent, signed */ 463 464 f32.f = x; 465 f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S; 466 f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E; 467 f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M; 468 469 f16_s = f32_s; 470 f16_e = 0; 471 f16_m = 0; 472 473 switch (f32_e) { 474 case (0): /* float32: zero or subnormal number */ 475 f16_e = 0; 476 f16_m = 0; /* convert to zero */ 477 break; 478 case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */ 479 f16_e = FP16_MASK_E >> FP16_LSB_E; 480 if (f32_m == 0) { /* infinity */ 481 f16_m = 0; 482 } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */ 483 f16_m = f32_m >> (FP32_MSB_M - FP16_MSB_M); 484 f16_m |= BIT(FP16_MSB_M); 485 } 486 break; 487 default: /* float32: normal number */ 488 /* compute biased exponent for float16 */ 489 be_16 = (int)f32_e - FP32_BIAS_E + FP16_BIAS_E; 490 491 /* overflow, be_16 = [31-INF], set to infinity */ 492 if (be_16 >= (int)(FP16_MASK_E >> FP16_LSB_E)) { 493 f16_e = FP16_MASK_E >> FP16_LSB_E; 494 f16_m = 0; 495 } else if ((be_16 >= 1) && (be_16 < (int)(FP16_MASK_E >> FP16_LSB_E))) { 496 /* normal float16, be_16 = [1:30]*/ 497 f16_e = be_16; 498 m_16 = f32_m >> (FP32_LSB_E - FP16_LSB_E); 499 tmsb = FP32_MSB_M - FP16_MSB_M - 1; 500 if ((f32_m & GENMASK_U32(tmsb, 0)) > BIT(tmsb)) { 501 /* round: non-zero truncated bits except MSB */ 502 m_16++; 503 504 /* overflow into exponent */ 505 if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1) 506 f16_e++; 507 } else if ((f32_m & GENMASK_U32(tmsb, 0)) == BIT(tmsb)) { 508 /* round: MSB of truncated bits and LSB of m_16 is set */ 509 if ((m_16 & 0x1) == 0x1) { 510 m_16++; 511 512 /* overflow into exponent */ 513 if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1) 514 f16_e++; 515 } 516 } 517 f16_m = m_16 & FP16_MASK_M; 518 } else if ((be_16 >= -(int)(FP16_MSB_M)) && (be_16 < 1)) { 519 /* underflow: zero / subnormal, be_16 = [-9:0] */ 520 f16_e = 0; 521 522 /* add implicit leading zero */ 523 m_32 = f32_m | BIT(FP32_LSB_E); 524 tbits = FP32_LSB_E - FP16_LSB_E - be_16 + 1; 525 m_16 = m_32 >> tbits; 526 527 /* if non-leading truncated bits are set */ 528 if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) { 529 m_16++; 530 531 /* overflow into exponent */ 532 if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1) 533 f16_e++; 534 } else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) { 535 /* if leading truncated bit is set */ 536 if ((m_16 & 0x1) == 0x1) { 537 m_16++; 538 539 /* overflow into exponent */ 540 if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1) 541 f16_e++; 542 } 543 } 544 f16_m = m_16 & FP16_MASK_M; 545 } else if (be_16 == -(int)(FP16_MSB_M + 1)) { 546 /* underflow: zero, be_16 = [-10] */ 547 f16_e = 0; 548 if (f32_m != 0) 549 f16_m = 1; 550 else 551 f16_m = 0; 552 } else { 553 /* underflow: zero, be_16 = [-INF:-11] */ 554 f16_e = 0; 555 f16_m = 0; 556 } 557 558 break; 559 } 560 561 u16 = FP16_PACK(f16_s, f16_e, f16_m); 562 563 return u16; 564 } 565 566 int 567 rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements) 568 { 569 const float *input_buffer; 570 uint16_t *output_buffer; 571 uint64_t i; 572 573 if ((nb_elements == 0) || (input == NULL) || (output == NULL)) 574 return -EINVAL; 575 576 input_buffer = (const float *)input; 577 output_buffer = (uint16_t *)output; 578 579 for (i = 0; i < nb_elements; i++) { 580 *output_buffer = __float32_to_float16_scalar_rtn(*input_buffer); 581 582 input_buffer = input_buffer + 1; 583 output_buffer = output_buffer + 1; 584 } 585 586 return 0; 587 } 588 589 /* Convert a half precision floating point number (float16) into a single precision 590 * floating point number (float32). 591 */ 592 static float 593 __float16_to_float32_scalar_rtx(uint16_t f16) 594 { 595 union float32 f32; /* float32 output */ 596 uint16_t f16_s; /* float16 sign */ 597 uint16_t f16_e; /* float16 exponent */ 598 uint16_t f16_m; /* float16 mantissa */ 599 uint32_t f32_s; /* float32 sign */ 600 uint32_t f32_e; /* float32 exponent */ 601 uint32_t f32_m; /* float32 mantissa*/ 602 uint8_t shift; /* number of bits to be shifted */ 603 uint32_t clz; /* count of leading zeroes */ 604 int e_16; /* float16 exponent unbiased */ 605 606 f16_s = (f16 & FP16_MASK_S) >> FP16_LSB_S; 607 f16_e = (f16 & FP16_MASK_E) >> FP16_LSB_E; 608 f16_m = (f16 & FP16_MASK_M) >> FP16_LSB_M; 609 610 f32_s = f16_s; 611 switch (f16_e) { 612 case (FP16_MASK_E >> FP16_LSB_E): /* float16: infinity or nan */ 613 f32_e = FP32_MASK_E >> FP32_LSB_E; 614 if (f16_m == 0x0) { /* infinity */ 615 f32_m = f16_m; 616 } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */ 617 f32_m = f16_m; 618 shift = FP32_MSB_M - FP16_MSB_M; 619 f32_m = (f32_m << shift) & FP32_MASK_M; 620 f32_m |= BIT(FP32_MSB_M); 621 } 622 break; 623 case 0: /* float16: zero or sub-normal */ 624 f32_m = f16_m; 625 if (f16_m == 0) { /* zero signed */ 626 f32_e = 0; 627 } else { /* subnormal numbers */ 628 clz = rte_clz32((uint32_t)f16_m) - sizeof(uint32_t) * 8 + FP16_LSB_E; 629 e_16 = (int)f16_e - clz; 630 f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E; 631 632 shift = clz + (FP32_MSB_M - FP16_MSB_M) + 1; 633 f32_m = (f32_m << shift) & FP32_MASK_M; 634 } 635 break; 636 default: /* normal numbers */ 637 f32_m = f16_m; 638 e_16 = (int)f16_e; 639 f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E; 640 641 shift = (FP32_MSB_M - FP16_MSB_M); 642 f32_m = (f32_m << shift) & FP32_MASK_M; 643 } 644 645 f32.u = FP32_PACK(f32_s, f32_e, f32_m); 646 647 return f32.f; 648 } 649 650 int 651 rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements) 652 { 653 const uint16_t *input_buffer; 654 float *output_buffer; 655 uint64_t i; 656 657 if ((nb_elements == 0) || (input == NULL) || (output == NULL)) 658 return -EINVAL; 659 660 input_buffer = (const uint16_t *)input; 661 output_buffer = (float *)output; 662 663 for (i = 0; i < nb_elements; i++) { 664 *output_buffer = __float16_to_float32_scalar_rtx(*input_buffer); 665 666 input_buffer = input_buffer + 1; 667 output_buffer = output_buffer + 1; 668 } 669 670 return 0; 671 } 672