/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include "mldev_utils_scalar.h"

/* Description:
 * This file implements scalar versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16.
 *
 * Quantize direction:   q = clamp(round(x / scale) + zero_point)
 * Dequantize direction: x = scale * (q - zero_point)
 *
 * All quantize routines saturate to the destination type's range instead of invoking
 * undefined behavior on an out-of-range float-to-integer cast. NaN inputs are not
 * handled specially (same as the original behavior).
 */

/**
 * Quantize float32 values to int8 with saturation to [INT8_MIN, INT8_MAX].
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const float *input_buffer;
	int8_t *output_buffer;
	uint64_t i;
	int i32;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int8_t *)output;

	for (i = 0; i < nb_elements; i++) {
		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);

		if (i32 < INT8_MIN)
			i32 = INT8_MIN;

		if (i32 > INT8_MAX)
			i32 = INT8_MAX;

		*output_buffer = (int8_t)i32;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize int8 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const int8_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int8_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to uint8 with saturation to [0, UINT8_MAX].
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const float *input_buffer;
	uint8_t *output_buffer;
	int32_t i32;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint8_t *)output;

	for (i = 0; i < nb_elements; i++) {
		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);

		if (i32 < 0)
			i32 = 0;

		if (i32 > UINT8_MAX)
			i32 = UINT8_MAX;

		*output_buffer = (uint8_t)i32;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize uint8 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const uint8_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint8_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to int16 with saturation to [INT16_MIN, INT16_MAX].
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const float *input_buffer;
	int16_t *output_buffer;
	int32_t i32;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int16_t *)output;

	for (i = 0; i < nb_elements; i++) {
		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);

		if (i32 < INT16_MIN)
			i32 = INT16_MIN;

		if (i32 > INT16_MAX)
			i32 = INT16_MAX;

		*output_buffer = (int16_t)i32;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize int16 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const int16_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int16_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to uint16 with saturation to [0, UINT16_MAX].
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint16_t zero_point)
{
	const float *input_buffer;
	uint16_t *output_buffer;
	int32_t i32;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint16_t *)output;

	for (i = 0; i < nb_elements; i++) {
		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);

		if (i32 < 0)
			i32 = 0;

		if (i32 > UINT16_MAX)
			i32 = UINT16_MAX;

		*output_buffer = (uint16_t)i32;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize uint16 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint16_t zero_point)
{
	const uint16_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint16_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to int32 with saturation to [INT32_MIN, INT32_MAX].
 *
 * Saturation is performed in double precision before the cast: converting an
 * out-of-range double to int32_t is undefined behavior (C11 6.3.1.4).
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const float *input_buffer;
	int32_t *output_buffer;
	double d;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int32_t *)output;

	for (i = 0; i < nb_elements; i++) {
		d = round(*input_buffer / scale) + (double)zero_point;

		/* Both INT32_MIN and INT32_MAX are exactly representable in double. */
		if (d <= (double)INT32_MIN)
			*output_buffer = INT32_MIN;
		else if (d >= (double)INT32_MAX)
			*output_buffer = INT32_MAX;
		else
			*output_buffer = (int32_t)d;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize int32 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const int32_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int32_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to uint32 with saturation to [0, UINT32_MAX].
 *
 * Fix: the previous implementation stored the rounded value in an int32_t and
 * clamped only the low side, so values above INT32_MAX hit undefined behavior
 * and valid results in (INT32_MAX, UINT32_MAX] were lost. Saturate in double,
 * consistent with the uint8/uint16 variants.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint32_t zero_point)
{
	const float *input_buffer;
	uint32_t *output_buffer;
	double d;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint32_t *)output;

	for (i = 0; i < nb_elements; i++) {
		d = round(*input_buffer / scale) + (double)zero_point;

		if (d <= 0.0)
			*output_buffer = 0;
		else if (d >= (double)UINT32_MAX) /* UINT32_MAX < 2^53, exact in double */
			*output_buffer = UINT32_MAX;
		else
			*output_buffer = (uint32_t)d;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize uint32 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint32_t zero_point)
{
	const uint32_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint32_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to int64 with saturation to [INT64_MIN, INT64_MAX].
 *
 * Saturation is performed in double precision before the cast; 0x1p63 is the
 * first double above INT64_MAX and -0x1p63 equals INT64_MIN exactly.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const float *input_buffer;
	int64_t *output_buffer;
	double d;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int64_t *)output;

	for (i = 0; i < nb_elements; i++) {
		d = round(*input_buffer / scale) + (double)zero_point;

		if (d < -0x1p63)
			*output_buffer = INT64_MIN;
		else if (d >= 0x1p63)
			*output_buffer = INT64_MAX;
		else
			*output_buffer = (int64_t)d;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize int64 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const int64_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int64_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Quantize float32 values to uint64 with saturation to [0, UINT64_MAX].
 *
 * Fix: the previous implementation routed the value through int64_t with no
 * upper clamp, so values at or above 2^63 hit undefined behavior. Saturate in
 * double; 0x1p64 is the first double above UINT64_MAX.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const float *input_buffer;
	uint64_t *output_buffer;
	double d;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint64_t *)output;

	for (i = 0; i < nb_elements; i++) {
		d = round(*input_buffer / scale) + (double)zero_point;

		if (d <= 0.0)
			*output_buffer = 0;
		else if (d >= 0x1p64)
			*output_buffer = UINT64_MAX;
		else
			*output_buffer = (uint64_t)d;

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/**
 * Dequantize uint64 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid scale/count/pointers.
 */
int
rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const uint64_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint64_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = scale * (float)(*input_buffer - zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}

/* Convert a single precision floating point number (float32) into a half precision
 * floating point number (float16) using round to nearest rounding mode.
 */
static uint16_t
__float32_to_float16_scalar_rtn(float x)
{
	union float32 f32; /* float32 input */
	uint32_t f32_s;	   /* float32 sign */
	uint32_t f32_e;	   /* float32 exponent */
	uint32_t f32_m;	   /* float32 mantissa */
	uint16_t f16_s;	   /* float16 sign */
	uint16_t f16_e;	   /* float16 exponent */
	uint16_t f16_m;	   /* float16 mantissa */
	uint32_t tbits;	   /* number of truncated bits */
	uint32_t tmsb;	   /* MSB position of truncated bits */
	uint32_t m_32;	   /* temporary float32 mantissa */
	uint16_t m_16;	   /* temporary float16 mantissa */
	uint16_t u16;	   /* float16 output */
	int be_16;	   /* float16 biased exponent, signed */

	f32.f = x;
	f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;
	f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;
	f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;

	f16_s = f32_s;
	f16_e = 0;
	f16_m = 0;

	switch (f32_e) {
	case (0): /* float32: zero or subnormal number */
		f16_e = 0;
		f16_m = 0; /* convert to zero */
		break;
	case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
		f16_e = FP16_MASK_E >> FP16_LSB_E;
		if (f32_m == 0) { /* infinity */
			f16_m = 0;
		} else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
			f16_m = f32_m >> (FP32_MSB_M - FP16_MSB_M);
			f16_m |= BIT(FP16_MSB_M);
		}
		break;
	default: /* float32: normal number */
		/* compute biased exponent for float16 */
		be_16 = (int)f32_e - FP32_BIAS_E + FP16_BIAS_E;

		/* overflow, be_16 = [31-INF], set to infinity */
		if (be_16 >= (int)(FP16_MASK_E >> FP16_LSB_E)) {
			f16_e = FP16_MASK_E >> FP16_LSB_E;
			f16_m = 0;
		} else if ((be_16 >= 1) && (be_16 < (int)(FP16_MASK_E >> FP16_LSB_E))) {
			/* normal float16, be_16 = [1:30]*/
			f16_e = be_16;
			m_16 = f32_m >> (FP32_LSB_E - FP16_LSB_E);
			tmsb = FP32_MSB_M - FP16_MSB_M - 1;
			if ((f32_m & GENMASK_U32(tmsb, 0)) > BIT(tmsb)) {
				/* round: non-zero truncated bits except MSB */
				m_16++;

				/* overflow into exponent */
				if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
					f16_e++;
			} else if ((f32_m & GENMASK_U32(tmsb, 0)) == BIT(tmsb)) {
				/* round: MSB of truncated bits and LSB of m_16 is set */
				if ((m_16 & 0x1) == 0x1) {
					m_16++;

					/* overflow into exponent */
					if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
						f16_e++;
				}
			}
			f16_m = m_16 & FP16_MASK_M;
		} else if ((be_16 >= -(int)(FP16_MSB_M)) && (be_16 < 1)) {
			/* underflow: zero / subnormal, be_16 = [-9:0] */
			f16_e = 0;

			/* add implicit leading zero */
			m_32 = f32_m | BIT(FP32_LSB_E);
			tbits = FP32_LSB_E - FP16_LSB_E - be_16 + 1;
			m_16 = m_32 >> tbits;

			/* if non-leading truncated bits are set */
			if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {
				m_16++;

				/* overflow into exponent */
				if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
					f16_e++;
			} else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {
				/* if leading truncated bit is set */
				if ((m_16 & 0x1) == 0x1) {
					m_16++;

					/* overflow into exponent */
					if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
						f16_e++;
				}
			}
			f16_m = m_16 & FP16_MASK_M;
		} else if (be_16 == -(int)(FP16_MSB_M + 1)) {
			/* underflow: zero, be_16 = [-10] */
			f16_e = 0;
			if (f32_m != 0)
				f16_m = 1;
			else
				f16_m = 0;
		} else {
			/* underflow: zero, be_16 = [-INF:-11] */
			f16_e = 0;
			f16_m = 0;
		}

		break;
	}

	u16 = FP16_PACK(f16_s, f16_e, f16_m);

	return u16;
}

/**
 * Convert an array of float32 values to float16 (round to nearest).
 *
 * @return 0 on success, -EINVAL on invalid count/pointers.
 */
int
rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
{
	const float *input_buffer;
	uint16_t *output_buffer;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint16_t *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = __float32_to_float16_scalar_rtn(*input_buffer);

		input_buffer = input_buffer + 1;
		output_buffer = output_buffer + 1;
	}

	return 0;
}

/* Convert a half precision floating point number (float16) into a single precision
 * floating point number (float32).
 */
static float
__float16_to_float32_scalar_rtx(uint16_t f16)
{
	union float32 f32; /* float32 output */
	uint16_t f16_s;	   /* float16 sign */
	uint16_t f16_e;	   /* float16 exponent */
	uint16_t f16_m;	   /* float16 mantissa */
	uint32_t f32_s;	   /* float32 sign */
	uint32_t f32_e;	   /* float32 exponent */
	uint32_t f32_m;	   /* float32 mantissa*/
	uint8_t shift;	   /* number of bits to be shifted */
	uint32_t clz;	   /* count of leading zeroes */
	int e_16;	   /* float16 exponent unbiased */

	f16_s = (f16 & FP16_MASK_S) >> FP16_LSB_S;
	f16_e = (f16 & FP16_MASK_E) >> FP16_LSB_E;
	f16_m = (f16 & FP16_MASK_M) >> FP16_LSB_M;

	f32_s = f16_s;
	switch (f16_e) {
	case (FP16_MASK_E >> FP16_LSB_E): /* float16: infinity or nan */
		f32_e = FP32_MASK_E >> FP32_LSB_E;
		if (f16_m == 0x0) { /* infinity */
			f32_m = f16_m;
		} else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
			f32_m = f16_m;
			shift = FP32_MSB_M - FP16_MSB_M;
			f32_m = (f32_m << shift) & FP32_MASK_M;
			f32_m |= BIT(FP32_MSB_M);
		}
		break;
	case 0: /* float16: zero or sub-normal */
		f32_m = f16_m;
		if (f16_m == 0) { /* zero signed */
			f32_e = 0;
		} else { /* subnormal numbers */
			clz = rte_clz32((uint32_t)f16_m) - sizeof(uint32_t) * 8 + FP16_LSB_E;
			e_16 = (int)f16_e - clz;
			f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E;

			shift = clz + (FP32_MSB_M - FP16_MSB_M) + 1;
			f32_m = (f32_m << shift) & FP32_MASK_M;
		}
		break;
	default: /* normal numbers */
		f32_m = f16_m;
		e_16 = (int)f16_e;
		f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E;

		shift = (FP32_MSB_M - FP16_MSB_M);
		f32_m = (f32_m << shift) & FP32_MASK_M;
	}

	f32.u = FP32_PACK(f32_s, f32_e, f32_m);

	return f32.f;
}

/**
 * Convert an array of float16 values to float32.
 *
 * @return 0 on success, -EINVAL on invalid count/pointers.
 */
int
rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
{
	const uint16_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint16_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		*output_buffer = __float16_to_float32_scalar_rtx(*input_buffer);

		input_buffer = input_buffer + 1;
		output_buffer = output_buffer + 1;
	}

	return 0;
}