/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vectorized versions of the Machine Learning utility functions used to
 * convert data types from higher precision to lower precision and vice versa, except for
 * bfloat16. The implementation is based on Arm Neon intrinsics.
 */

static inline void
__float32_to_int8_neon_s8x8(const float *input, int8_t *output, float scale, int8_t zero_point)
{
        int16x4_t s16x4_l;
        int16x4_t s16x4_h;
        float32x4_t f32x4;
        int16x8_t s16x8;
        int32x4_t s32x4;
        int8x8_t s8x8;

        /* load 4 float32 elements, scale, convert, saturate narrow to int16.
         * Use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input);
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        s32x4 = vcvtaq_s32_f32(f32x4);
        s16x4_l = vqmovn_s32(s32x4);

        /* load next 4 float32 elements, scale, convert, saturate narrow to int16.
         * Use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input + 4);
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        s32x4 = vcvtaq_s32_f32(f32x4);
        s16x4_h = vqmovn_s32(s32x4);

        /* combine lower and higher int16x4_t to int16x8_t */
        s16x8 = vcombine_s16(s16x4_l, s16x4_h);

        /* narrow to int8_t */
        s8x8 = vqmovn_s16(s16x8);
        s8x8 = vmax_s8(s8x8, vdup_n_s8(INT8_MIN + 1));

        /* store 8 elements */
        vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(const float *input, int8_t *output, float scale, int8_t zero_point)
{
        float32x2_t f32x2;
        int32x2_t s32x2;
        int16_t s16;

        /* scale and convert, round to nearest with ties away rounding mode */
        f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
        s32x2 = vcvta_s32_f32(f32x2);
        s32x2 = vmax_s32(s32x2, vdup_n_s32(INT8_MIN + 1));

        /* saturate narrow */
        s16 = vqmovns_s32(vget_lane_s32(s32x2, 0));

        /* convert to int8_t */
        *output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
                          int8_t zero_point)
{
        const float *input_buffer;
        int8_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (int8_t *)output;
        vlen = 2 * sizeof(float) / sizeof(int8_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_int8_neon_s8x8(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_int8_neon_s8x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}
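
/* Illustrative note (not part of the library): every driver below splits the work into
 * full-width NEON iterations plus a scalar tail. For example, with nb_elements = 19 and
 * vlen = 8 in rte_ml_io_float32_to_int8 above, nb_iterations = 19 / 8 = 2, so 16 elements
 * are converted by the 8-wide kernel and the remaining 3 by the scalar kernel.
 */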

static inline void
__int8_to_float32_neon_f32x8(const int8_t *input, float *output, float scale, int8_t zero_point)
{
        float32x4_t f32x4;
        int16x8_t s16x8;
        int16x4_t s16x4;
        int32x4_t s32x4;
        int8x8_t s8x8;

        /* load 8 x int8_t elements */
        s8x8 = vld1_s8(input);

        /* widen int8_t to int16_t */
        s16x8 = vmovl_s8(s8x8);

        /* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
        s16x4 = vget_low_s16(s16x8);
        s32x4 = vmovl_s16(s16x4);
        f32x4 = vcvtq_f32_s32(s32x4);
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output, f32x4);

        /* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
        s16x4 = vget_high_s16(s16x8);
        s32x4 = vmovl_s16(s16x4);
        f32x4 = vcvtq_f32_s32(s32x4);
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(const int8_t *input, float *output, float scale, int8_t zero_point)
{
        *output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
}

int
rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                          int8_t zero_point)
{
        const int8_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const int8_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(int8_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __int8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __int8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}
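
/* Illustrative usage sketch (not part of the library): quantize eight float32 values to
 * int8 and dequantize them back, assuming scale and zero_point were chosen by the
 * application.
 *
 *	float in[8] = {0.1f, -0.2f, 0.3f, -0.4f, 0.5f, -0.6f, 0.7f, -0.8f};
 *	int8_t q[8];
 *	float out[8];
 *
 *	rte_ml_io_float32_to_int8(in, q, 8, 0.05f, 0);
 *	rte_ml_io_int8_to_float32(q, out, 8, 0.05f, 0);
 */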

static inline void
__float32_to_uint8_neon_u8x8(const float *input, uint8_t *output, float scale, uint8_t zero_point)
{
        uint16x4_t u16x4_l;
        uint16x4_t u16x4_h;
        float32x4_t f32x4;
        uint32x4_t u32x4;
        uint16x8_t u16x8;
        uint8x8_t u8x8;

        /* load 4 float elements, scale, convert, saturate narrow to uint16_t.
         * use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input);
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        u32x4 = vcvtaq_u32_f32(f32x4);
        u16x4_l = vqmovn_u32(u32x4);

        /* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
         * use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input + 4);
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        u32x4 = vcvtaq_u32_f32(f32x4);
        u16x4_h = vqmovn_u32(u32x4);

        /* combine lower and higher uint16x4_t */
        u16x8 = vcombine_u16(u16x4_l, u16x4_h);

        /* narrow to uint8x8_t */
        u8x8 = vqmovn_u16(u16x8);

        /* store 8 elements */
        vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(const float *input, uint8_t *output, float scale, uint8_t zero_point)
{
        float32x2_t f32x2;
        uint32x2_t u32x2;
        uint16_t u16;

        /* scale and convert, round to nearest with ties away rounding mode */
        f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
        u32x2 = vcvta_u32_f32(f32x2);

        /* saturate narrow */
        u16 = vqmovns_u32(vget_lane_u32(u32x2, 0));

        /* convert to uint8_t */
        *output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
                           uint8_t zero_point)
{
        const float *input_buffer;
        uint8_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (uint8_t *)output;
        vlen = 2 * sizeof(float) / sizeof(uint8_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_uint8_neon_u8x8(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_uint8_neon_u8x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(const uint8_t *input, float *output, float scale, uint8_t zero_point)
{
        float32x4_t f32x4;
        uint16x8_t u16x8;
        int16x8_t s16x8;
        int16x4_t s16x4;
        int32x4_t s32x4;
        uint8x8_t u8x8;

        /* load 8 x uint8_t elements */
        u8x8 = vld1_u8(input);
        u16x8 = vmovl_u8(u8x8);
        s16x8 = vreinterpretq_s16_u16(u16x8);

        /* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
        s16x4 = vget_low_s16(s16x8);
        s32x4 = vmovl_s16(s16x4);
        f32x4 = vcvtq_f32_s32(s32x4);
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output, f32x4);

        /* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
        s16x4 = vget_high_s16(s16x8);
        s32x4 = vmovl_s16(s16x4);
        f32x4 = vcvtq_f32_s32(s32x4);
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(const uint8_t *input, float *output, float scale, uint8_t zero_point)
{
        *output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
}

int
rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                           uint8_t zero_point)
{
        const uint8_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint64_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const uint8_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(uint8_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __uint8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __uint8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float32_to_int16_neon_s16x4(const float *input, int16_t *output, float scale, int16_t zero_point)
{
        float32x4_t f32x4;
        int16x4_t s16x4;
        int32x4_t s32x4;

        /* load 4 x float elements */
        f32x4 = vld1q_f32(input);

        /* scale */
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

        /* add zero point */
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* convert to int32x4_t using round to nearest with ties away rounding mode */
        s32x4 = vcvtaq_s32_f32(f32x4);

        /* saturate narrow to int16x4_t */
        s16x4 = vqmovn_s32(s32x4);
        s16x4 = vmax_s16(s16x4, vdup_n_s16(INT16_MIN + 1));

        /* store 4 elements */
        vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(const float *input, int16_t *output, float scale, int16_t zero_point)
{
        float32x2_t f32x2;
        int32x2_t s32x2;

        /* scale and convert, round to nearest with ties away rounding mode */
        f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
        s32x2 = vcvta_s32_f32(f32x2);
        s32x2 = vmax_s32(s32x2, vdup_n_s32(INT16_MIN + 1));

        /* saturate narrow */
        *output = vqmovns_s32(vget_lane_s32(s32x2, 0));
}

int
rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
                           int16_t zero_point)
{
        const float *input_buffer;
        int16_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (int16_t *)output;
        vlen = 2 * sizeof(float) / sizeof(int16_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_int16_neon_s16x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_int16_neon_s16x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__int16_to_float32_neon_f32x4(const int16_t *input, float *output, float scale, int16_t zero_point)
{
        float32x4_t f32x4;
        int16x4_t s16x4;
        int32x4_t s32x4;

        /* load 4 x int16_t elements */
        s16x4 = vld1_s16(input);

        /* widen int16_t to int32_t */
        s32x4 = vmovl_s16(s16x4);

        /* convert int32_t to float */
        f32x4 = vcvtq_f32_s32(s32x4);

        /* subtract zero point */
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);

        /* store float32x4_t */
        vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(const int16_t *input, float *output, float scale, int16_t zero_point)
{
        *output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
}

int
rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                           int16_t zero_point)
{
        const int16_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const int16_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(int16_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __int16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __int16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(const float *input, uint16_t *output, float scale,
                               uint16_t zero_point)
{
        float32x4_t f32x4;
        uint16x4_t u16x4;
        uint32x4_t u32x4;

        /* load 4 float elements */
        f32x4 = vld1q_f32(input);

        /* scale */
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

        /* add zero point */
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* convert using round to nearest with ties away rounding mode */
        u32x4 = vcvtaq_u32_f32(f32x4);

        /* saturate narrow */
        u16x4 = vqmovn_u32(u32x4);

        /* store 4 elements */
        vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(const float *input, uint16_t *output, float scale,
                               uint16_t zero_point)
{
        uint32_t u32;

        /* scale and convert, round to nearest with ties away rounding mode */
        u32 = vcvtas_u32_f32((*input) / scale + (float)zero_point);

        /* saturate narrow */
        *output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
                            uint16_t zero_point)
{
        const float *input_buffer;
        uint16_t *output_buffer;
        uint64_t nb_iterations;
        uint64_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (uint16_t *)output;
        vlen = 2 * sizeof(float) / sizeof(uint16_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_uint16_neon_u16x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_uint16_neon_u16x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}
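
/* Note on the float32 to unsigned integer paths above: vcvtaq_u32_f32 and vcvtas_u32_f32 map
 * to FCVTAU, which saturates negative inputs to zero, and the subsequent vqmovn/vqmovns
 * narrowing saturates values above the destination type's maximum.
 */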

static inline void
__uint16_to_float32_neon_f32x4(const uint16_t *input, float *output, float scale,
                               uint16_t zero_point)
{
        float32x4_t f32x4;
        uint16x4_t u16x4;
        uint32x4_t u32x4;

        /* load 4 x uint16_t elements */
        u16x4 = vld1_u16(input);

        /* widen uint16_t to uint32_t */
        u32x4 = vmovl_u16(u16x4);

        /* convert uint32_t to float */
        f32x4 = vcvtq_f32_u32(u32x4);

        /* subtract zero point */
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);

        /* store float32x4_t */
        vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(const uint16_t *input, float *output, float scale,
                               uint16_t zero_point)
{
        *output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
}

int
rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                            uint16_t zero_point)
{
        const uint16_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const uint16_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(uint16_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __uint16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __uint16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float32_to_int32_neon_s32x4(const float *input, int32_t *output, float scale, int32_t zero_point)
{
        float32x4_t f32x4;
        int32x4_t s32x4;

        /* load 4 x float elements */
        f32x4 = vld1q_f32(input);

        /* scale */
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

        /* add zero point */
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* convert to int32x4_t using round to nearest with ties away rounding mode */
        s32x4 = vcvtaq_s32_f32(f32x4);
        s32x4 = vmaxq_s32(s32x4, vdupq_n_s32(INT32_MIN + 1));

        /* store 4 elements */
        vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(const float *input, int32_t *output, float scale, int32_t zero_point)
{
        float32x2_t f32x2;
        int32x2_t s32x2;

        /* scale and convert, round to nearest with ties away rounding mode */
        f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
        s32x2 = vcvta_s32_f32(f32x2);
        s32x2 = vmax_s32(s32x2, vdup_n_s32(INT32_MIN + 1));

        /* store lane 0 */
        vst1_lane_s32(output, s32x2, 0);
}

int
rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
                           int32_t zero_point)
{
        const float *input_buffer;
        int32_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (int32_t *)output;
        vlen = 4 * sizeof(float) / sizeof(int32_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_int32_neon_s32x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_int32_neon_s32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__int32_to_float32_neon_f32x4(const int32_t *input, float *output, float scale, int32_t zero_point)
{
        float32x4_t f32x4;
        int32x4_t s32x4;

        /* load 4 x int32_t elements */
        s32x4 = vld1q_s32(input);

        /* convert int32_t to float */
        f32x4 = vcvtq_f32_s32(s32x4);

        /* subtract zero point */
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);

        /* store float32x4_t */
        vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(const int32_t *input, float *output, float scale, int32_t zero_point)
{
        *output = scale * (vcvts_f32_s32(*input) - (float)zero_point);
}

int
rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                           int32_t zero_point)
{
        const int32_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const int32_t *)input;
        output_buffer = (float *)output;
        vlen = 4 * sizeof(float) / sizeof(int32_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __int32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __int32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(const float *input, uint32_t *output, float scale,
                               uint32_t zero_point)
{
        float32x4_t f32x4;
        uint32x4_t u32x4;

        /* load 4 float elements */
        f32x4 = vld1q_f32(input);

        /* scale */
        f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

        /* add zero point */
        f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* convert using round to nearest with ties away rounding mode */
        u32x4 = vcvtaq_u32_f32(f32x4);

        /* store 4 elements */
        vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(const float *input, uint32_t *output, float scale,
                               uint32_t zero_point)
{
        /* scale and convert, round to nearest with ties away rounding mode */
        *output = vcvtas_u32_f32((*input) / scale + (float)zero_point);
}

int
rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
                            uint32_t zero_point)
{
        const float *input_buffer;
        uint32_t *output_buffer;
        uint64_t nb_iterations;
        uint64_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (uint32_t *)output;
        vlen = 4 * sizeof(float) / sizeof(uint32_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_uint32_neon_u32x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_uint32_neon_u32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(const uint32_t *input, float *output, float scale,
                               uint32_t zero_point)
{
        float32x4_t f32x4;
        uint32x4_t u32x4;

        /* load 4 x uint32_t elements */
        u32x4 = vld1q_u32(input);

        /* convert uint32_t to float */
        f32x4 = vcvtq_f32_u32(u32x4);

        /* subtract zero point */
        f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);

        /* store float32x4_t */
        vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(const uint32_t *input, float *output, float scale,
                               uint32_t zero_point)
{
        *output = scale * (vcvts_f32_u32(*input) - (float)zero_point);
}

int
rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                            uint32_t zero_point)
{
        const uint32_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const uint32_t *)input;
        output_buffer = (float *)output;
        vlen = 4 * sizeof(float) / sizeof(uint32_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __uint32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __uint32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}
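
/* Note on the 64-bit paths below: AArch64 NEON has no direct conversion between float32
 * vectors and 64-bit integer vectors, so values are first widened to float64x2_t with
 * vcvt_f64_f32 and then converted with vcvtaq_s64_f64/vcvtaq_u64_f64. Since float32 carries
 * only a 24-bit significand, very large 64-bit values cannot be represented exactly.
 */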

static inline void
__float32_to_int64_neon_s64x2(const float *input, int64_t *output, float scale, int64_t zero_point)
{
        float32x2_t f32x2;
        float64x2_t f64x2;
        int64x2_t s64x2;
        int64_t s64;

        /* load 2 x float elements */
        f32x2 = vld1_f32(input);

        /* scale */
        f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

        /* add zero point */
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);

        /* convert to int64x2_t */
        s64x2 = vcvtaq_s64_f64(f64x2);

        /* clamp to INT64_MIN + 1 and store both lanes */
        s64 = vgetq_lane_s64(s64x2, 0);
        output[0] = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
        s64 = vgetq_lane_s64(s64x2, 1);
        output[1] = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
}

static inline void
__float32_to_int64_neon_s64x1(const float *input, int64_t *output, float scale, int64_t zero_point)
{
        float32x2_t f32x2;
        float64x2_t f64x2;
        int64x2_t s64x2;
        int64_t s64;

        /* load 1 x float element */
        f32x2 = vdup_n_f32(*input);

        /* scale */
        f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

        /* add zero point */
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);

        /* convert to int64x2_t */
        s64x2 = vcvtaq_s64_f64(f64x2);
        s64 = vgetq_lane_s64(s64x2, 0);
        s64 = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;

        /* store lane 0 of int64x2_t */
        *output = s64;
}

int
rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
                           int64_t zero_point)
{
        const float *input_buffer;
        int64_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (int64_t *)output;
        vlen = 4 * sizeof(float) / sizeof(int64_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_int64_neon_s64x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_int64_neon_s64x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__int64_to_float32_neon_f32x2(const int64_t *input, float *output, float scale, int64_t zero_point)
{
        int64x2_t s64x2;
        float64x2_t f64x2;
        float32x2_t f32x2;

        /* load 2 x int64_t elements */
        s64x2 = vld1q_s64(input);

        /* convert int64x2_t to float64x2_t */
        f64x2 = vcvtq_f64_s64(s64x2);

        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);

        /* subtract zero_point */
        f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);

        /* store float32x2_t */
        vst1_f32(output, f32x2);
}

static inline void
__int64_to_float32_neon_f32x1(const int64_t *input, float *output, float scale, int64_t zero_point)
{
        int64x2_t s64x2;
        float64x2_t f64x2;
        float32x2_t f32x2;

        /* load 1 x int64_t element into lane 0 */
        s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);

        /* convert int64x2_t to float64x2_t */
        f64x2 = vcvtq_f64_s64(s64x2);

        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);

        /* subtract zero_point */
        f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);

        /* store float32x2_t lane 0 */
        vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                           int64_t zero_point)
{
        const int64_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const int64_t *)input;
        output_buffer = (float *)output;
        vlen = 4 * sizeof(float) / sizeof(int64_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __int64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __int64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float32_to_uint64_neon_u64x2(const float *input, uint64_t *output, float scale,
                               uint64_t zero_point)
{
        float32x2_t f32x2;
        float64x2_t f64x2;
        uint64x2_t u64x2;

        /* load 2 x float elements */
        f32x2 = vld1_f32(input);

        /* scale */
        f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

        /* add zero point */
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);

        /* convert to uint64x2_t */
        u64x2 = vcvtaq_u64_f64(f64x2);

        /* store 2 elements */
        vst1q_u64(output, u64x2);
}

static inline void
__float32_to_uint64_neon_u64x1(const float *input, uint64_t *output, float scale,
                               uint64_t zero_point)
{
        float32x2_t f32x2;
        float64x2_t f64x2;
        uint64x2_t u64x2;

        /* load 1 x float element */
        f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);

        /* scale */
        f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

        /* add zero_point */
        f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);

        /* convert to uint64x2_t */
        u64x2 = vcvtaq_u64_f64(f64x2);

        /* store lane 0 */
        vst1q_lane_u64(output, u64x2, 0);
}

int
rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
                            uint64_t zero_point)
{
        const float *input_buffer;
        uint64_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float *)input;
        output_buffer = (uint64_t *)output;
        vlen = 4 * sizeof(float) / sizeof(uint64_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_uint64_neon_u64x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_uint64_neon_u64x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__uint64_to_float32_neon_f32x2(const uint64_t *input, float *output, float scale,
                               uint64_t zero_point)
{
        uint64x2_t u64x2;
        float64x2_t f64x2;
        float32x2_t f32x2;

        /* load 2 x uint64_t elements */
        u64x2 = vld1q_u64(input);

        /* convert uint64x2_t to float64x2_t */
        f64x2 = vcvtq_f64_u64(u64x2);

        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);

        /* subtract zero_point */
        f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);

        /* store float32x2_t */
        vst1_f32(output, f32x2);
}

static inline void
__uint64_to_float32_neon_f32x1(const uint64_t *input, float *output, float scale,
                               uint64_t zero_point)
{
        uint64x2_t u64x2;
        float64x2_t f64x2;
        float32x2_t f32x2;

        /* load 1 x uint64_t element into lane 0 */
        u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);

        /* convert uint64x2_t to float64x2_t */
        f64x2 = vcvtq_f64_u64(u64x2);

        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);

        /* subtract zero_point */
        f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);

        /* store float32x2_t lane 0 */
        vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
                            uint64_t zero_point)
{
        const uint64_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const uint64_t *)input;
        output_buffer = (float *)output;
        vlen = 4 * sizeof(float) / sizeof(uint64_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __uint64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __uint64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float32_to_float16_neon_f16x4(const float32_t *input, float16_t *output)
{
        float32x4_t f32x4;
        float16x4_t f16x4;

        /* load 4 x float32_t elements */
        f32x4 = vld1q_f32(input);

        /* convert to float16x4_t */
        f16x4 = vcvt_f16_f32(f32x4);

        /* store float16x4_t */
        vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(const float32_t *input, float16_t *output)
{
        float32x4_t f32x4;
        float16x4_t f16x4;

        /* load element to 4 lanes */
        f32x4 = vld1q_dup_f32(input);

        /* convert float32_t to float16_t */
        f16x4 = vcvt_f16_f32(f32x4);

        /* store lane 0 / 1 element */
        vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
{
        const float32_t *input_buffer;
        float16_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float32_t *)input;
        output_buffer = (float16_t *)output;
        vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float32_to_float16_neon_f16x4(input_buffer, output_buffer);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float32_to_float16_neon_f16x1(input_buffer, output_buffer);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}

static inline void
__float16_to_float32_neon_f32x4(const float16_t *input, float32_t *output)
{
        float16x4_t f16x4;
        float32x4_t f32x4;

        /* load 4 x float16_t elements */
        f16x4 = vld1_f16(input);

        /* convert float16x4_t to float32x4_t */
        f32x4 = vcvt_f32_f16(f16x4);

        /* store float32x4_t */
        vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(const float16_t *input, float32_t *output)
{
        float16x4_t f16x4;
        float32x4_t f32x4;

        /* load element to 4 lanes */
        f16x4 = vld1_dup_f16(input);

        /* convert float16_t to float32_t */
        f32x4 = vcvt_f32_f16(f16x4);

        /* store 1 element */
        vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
{
        const float16_t *input_buffer;
        float32_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
        uint64_t i;

        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;

        input_buffer = (const float16_t *)input;
        output_buffer = (float32_t *)output;
        vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
        nb_iterations = nb_elements / vlen;

        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
                __float16_to_float32_neon_f32x4(input_buffer, output_buffer);
                input_buffer += vlen;
                output_buffer += vlen;
        }

        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
                __float16_to_float32_neon_f32x1(input_buffer, output_buffer);
                input_buffer++;
                output_buffer++;
        }

        return 0;
}
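
/* Illustrative usage sketch (not part of the library): convert four float32 values to
 * float16 and back.
 *
 *	float32_t src[4] = {1.0f, 2.5f, -3.25f, 65504.0f};
 *	float16_t half[4];
 *	float32_t dst[4];
 *
 *	rte_ml_io_float32_to_float16(src, half, 4);
 *	rte_ml_io_float16_to_float32(half, dst, 4);
 */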