/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */
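
/* Reference semantics (scalar sketch of what the vector paths below implement):
 * quantize as q = saturate(round_ties_away(x / scale + zero_point)) and
 * dequantize as x = scale * ((float)q - (float)zero_point). The example below
 * is illustrative only; the MLDEV_UTILS_NEON_EXAMPLE guard is a hypothetical
 * macro, not part of the library build.
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
static int
example_int8_round_trip(void)
{
	float in[8] = {-1.0f, -0.5f, 0.0f, 0.25f, 0.5f, 1.0f, 2.0f, 4.0f};
	int8_t q[8];
	float out[8];

	/* quantize with scale = 1/32 and zero_point = 0, then dequantize */
	if (rte_ml_io_float32_to_int8(in, q, 8, 0.03125f, 0) != 0)
		return -1;

	return rte_ml_io_int8_to_float32(q, out, 8, 0.03125f, 0);
}
#endif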

static inline void
__float32_to_int8_neon_s8x8(const float *input, int8_t *output, float scale, int8_t zero_point)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);
	s8x8 = vmax_s8(s8x8, vdup_n_s8(INT8_MIN + 1));

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(const float *input, int8_t *output, float scale, int8_t zero_point)
{
	float32x2_t f32x2;
	int32x2_t s32x2;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	s32x2 = vcvta_s32_f32(f32x2);
	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT8_MIN + 1));

	/* saturate narrow */
	s16 = vqmovns_s32(vget_lane_s32(s32x2, 0));

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(const int8_t *input, float *output, float scale, int8_t zero_point)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(const int8_t *input, float *output, float scale, int8_t zero_point)
{
	*output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
}

int
rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
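
/* Scalar reference for the int8 quantize path above (sketch only; the helper
 * name is hypothetical and not part of the library). It mirrors the vector
 * code: divide by scale, add zero_point, round to nearest with ties away from
 * zero, then saturate to [INT8_MIN + 1, INT8_MAX].
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
#include <math.h>

static int8_t
ref_float32_to_int8(float x, float scale, int8_t zero_point)
{
	float v = roundf(x / scale + (float)zero_point);

	if (v < (float)(INT8_MIN + 1))
		return INT8_MIN + 1;
	if (v > (float)INT8_MAX)
		return INT8_MAX;
	return (int8_t)v;
}
#endif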

static inline void
__float32_to_uint8_neon_u8x8(const float *input, uint8_t *output, float scale, uint8_t zero_point)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(const float *input, uint8_t *output, float scale, uint8_t zero_point)
{
	float32x2_t f32x2;
	uint32x2_t u32x2;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	u32x2 = vcvta_u32_f32(f32x2);

	/* saturate narrow */
	u16 = vqmovns_u32(vget_lane_u32(u32x2, 0));

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(const uint8_t *input, float *output, float scale, uint8_t zero_point)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements, widen to uint16_t and reinterpret as
	 * int16_t (all values fit in 8 bits)
	 */
	u8x8 = vld1_u8(input);
	u16x8 = vmovl_u8(u8x8);
	s16x8 = vreinterpretq_s16_u16(u16x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(const uint8_t *input, float *output, float scale, uint8_t zero_point)
{
	*output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
}

int
rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
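
/* Saturation sketch for the uint8 path (illustrative values only): scaled
 * results below 0 clamp to 0 and above UINT8_MAX clamp to UINT8_MAX through
 * the saturating convert/narrow steps above.
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
static void
example_uint8_saturation(void)
{
	float in[2] = {-100.0f, 1000.0f};
	uint8_t q[2]; /* expected {0, UINT8_MAX} with scale = 1, zero_point = 0 */

	rte_ml_io_float32_to_uint8(in, q, 2, 1.0f, 0);
}
#endif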

static inline void
__float32_to_int16_neon_s16x4(const float *input, int16_t *output, float scale, int16_t zero_point)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);
	s16x4 = vmax_s16(s16x4, vdup_n_s16(INT16_MIN + 1));

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(const float *input, int16_t *output, float scale, int16_t zero_point)
{
	float32x2_t f32x2;
	int32x2_t s32x2;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	s32x2 = vcvta_s32_f32(f32x2);
	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT16_MIN + 1));

	/* saturate narrow */
	*output = vqmovns_s32(vget_lane_s32(s32x2, 0));
}

int
rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(const int16_t *input, float *output, float scale, int16_t zero_point)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(const int16_t *input, float *output, float scale, int16_t zero_point)
{
	*output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
}

int
rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
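
/* Sketch: one common convention (an assumption, not mandated by this file)
 * for picking scale and zero_point for an asymmetric int16 range. Given data
 * bounds [min, max] with min <= 0 <= max, scale spans the representable range
 * [INT16_MIN + 1, INT16_MAX] and zero_point shifts min onto its lower end.
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
#include <math.h>

static void
example_int16_params(float min, float max, float *scale, int16_t *zero_point)
{
	*scale = (max - min) / (float)(INT16_MAX - (INT16_MIN + 1));
	*zero_point = (int16_t)((float)(INT16_MIN + 1) - roundf(min / *scale));
}
#endif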

static inline void
__float32_to_uint16_neon_u16x4(const float *input, uint16_t *output, float scale,
			       uint16_t zero_point)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(const float *input, uint16_t *output, float scale,
			       uint16_t zero_point)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode;
	 * zero_point is already added before the conversion
	 */
	u32 = vcvtas_u32_f32((*input) / scale + (float)zero_point);

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint16_t zero_point)
{
	const float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(const uint16_t *input, float *output, float scale,
			       uint16_t zero_point)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(const uint16_t *input, float *output, float scale,
			       uint16_t zero_point)
{
	*output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
}

int
rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint16_t zero_point)
{
	const uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
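
/* Tail handling sketch (illustrative arithmetic only): with vlen = 4 and
 * nb_elements = 10, the vector loop above runs 10 / 4 = 2 iterations
 * (8 elements) and the scalar tail converts the remaining 2.
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
static uint64_t
example_tail_elements(uint64_t nb_elements, uint32_t vlen)
{
	/* elements left for the x1 helpers after the vector loop */
	return nb_elements - (nb_elements / vlen) * vlen;
}
#endif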

static inline void
__float32_to_int32_neon_s32x4(const float *input, int32_t *output, float scale, int32_t zero_point)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* clamp away from INT32_MIN */
	s32x4 = vmaxq_s32(s32x4, vdupq_n_s32(INT32_MIN + 1));

	/* store 4 elements */
	vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(const float *input, int32_t *output, float scale, int32_t zero_point)
{
	float32x2_t f32x2;
	int32x2_t s32x2;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	s32x2 = vcvta_s32_f32(f32x2);
	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT32_MIN + 1));

	/* store lane 0 */
	vst1_lane_s32(output, s32x2, 0);
}

int
rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int32_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(const int32_t *input, float *output, float scale, int32_t zero_point)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(const int32_t *input, float *output, float scale, int32_t zero_point)
{
	*output = scale * (vcvts_f32_s32(*input) - (float)zero_point);
}

int
rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int32_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
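
/* Precision sketch (illustrative values): float32 carries a 24-bit
 * significand, so int32 magnitudes beyond 2^24 are not all exactly
 * representable and the conversions above may round them.
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
static void
example_int32_precision(void)
{
	int32_t in[2] = {16777216, 16777217}; /* 2^24 and 2^24 + 1 */
	float out[2]; /* both become 16777216.0f with scale = 1, zero_point = 0 */

	rte_ml_io_int32_to_float32(in, out, 2, 1.0f, 0);
}
#endif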

static inline void
__float32_to_uint32_neon_u32x4(const float *input, uint32_t *output, float scale,
			       uint32_t zero_point)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(const float *input, uint32_t *output, float scale,
			       uint32_t zero_point)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32((*input) / scale + (float)zero_point);
}

int
rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint32_t zero_point)
{
	const float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint32_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(const uint32_t *input, float *output, float scale,
			       uint32_t zero_point)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(const uint32_t *input, float *output, float scale,
			       uint32_t zero_point)
{
	*output = scale * (vcvts_f32_u32(*input) - (float)zero_point);
}

int
rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint32_t zero_point)
{
	const uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint32_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
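
/* Dequantize usage sketch (illustrative values only): each output is
 * scale * ((float)q[i] - (float)zero_point), matching the vector and scalar
 * paths above.
 */
#ifdef MLDEV_UTILS_NEON_EXAMPLE
static void
example_uint32_dequantize(void)
{
	uint32_t q[4] = {0, 100, 200, 300};
	float x[4]; /* {-25.0f, 0.0f, 25.0f, 50.0f} with scale = 0.25, zero_point = 100 */

	rte_ml_io_uint32_to_float32(q, x, 4, 0.25f, 100);
}
#endif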

static inline void
__float32_to_int64_neon_s64x2(const float *input, int64_t *output, float scale, int64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;
	int64_t s64;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* clamp both lanes away from INT64_MIN; Neon has no 64-bit integer max */
	s64 = vgetq_lane_s64(s64x2, 0);
	s64x2 = vsetq_lane_s64((s64 == INT64_MIN) ? INT64_MIN + 1 : s64, s64x2, 0);
	s64 = vgetq_lane_s64(s64x2, 1);
	s64x2 = vsetq_lane_s64((s64 == INT64_MIN) ? INT64_MIN + 1 : s64, s64x2, 1);

	/* store 2 elements */
	vst1q_s64(output, s64x2);
}

static inline void
__float32_to_int64_neon_s64x1(const float *input, int64_t *output, float scale, int64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;
	int64_t s64;

	/* load 1 x float element */
	f32x2 = vdup_n_f32(*input);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);
	s64 = vgetq_lane_s64(s64x2, 0);
	s64 = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;

	/* store lane 0 of int64x2_t */
	*output = s64;
}

int
rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const float *input_buffer;
	int64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int64_neon_s64x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int64_neon_s64x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int64_to_float32_neon_f32x2(const int64_t *input, float *output, float scale, int64_t zero_point)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x int64_t elements */
	s64x2 = vld1q_s64(input);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__int64_to_float32_neon_f32x1(const int64_t *input, float *output, float scale, int64_t zero_point)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x int64_t element */
	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t lane 0 */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const int64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
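
/* Illustrative sketch, hypothetical values: dequantizing the {4, 0, 3, 9} result from the
 * example above with the same scale and zero point gives {1.0f, -1.0f, 0.5f, 3.5f}. The
 * original 0.25f comes back as 0.5f, i.e. the round-trip error is bounded by scale / 2.
 */
#ifdef MLDEV_UTILS_NEON_SELFTEST
static void
__example_int64_roundtrip(void)
{
	int64_t q[4] = {4, 0, 3, 9};
	float deq[4];

	/* expected output: {1.0f, -1.0f, 0.5f, 3.5f} */
	rte_ml_io_int64_to_float32(q, deq, 4, 0.5f, 2);
}
#endif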

static inline void
__float32_to_uint64_neon_u64x2(const float *input, uint64_t *output, float scale,
			       uint64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store 2 elements */
	vst1q_u64(output, u64x2);
}

static inline void
__float32_to_uint64_neon_u64x1(const float *input, uint64_t *output, float scale,
			       uint64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 1 x float element */
	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store lane 0 / 1 element */
	vst1q_lane_u64(output, u64x2, 0);
}

int
rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const float *input_buffer;
	uint64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint64_neon_u64x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint64_neon_u64x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
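
/* Illustrative sketch, hypothetical values: with unsigned outputs, the zero point is what
 * makes small negative inputs representable. With scale = 1.0f and zero_point = 10, -4.0f
 * maps to 6; -20.0f produces a negative intermediate, which vcvtaq_u64_f64 saturates to 0.
 */
#ifdef MLDEV_UTILS_NEON_SELFTEST
static void
__example_float32_to_uint64(void)
{
	float in[2] = {-4.0f, -20.0f};
	uint64_t out[2];

	/* expected output: {6, 0} */
	rte_ml_io_float32_to_uint64(in, out, 2, 1.0f, 10);
}
#endif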

static inline void
__uint64_to_float32_neon_f32x2(const uint64_t *input, float *output, float scale,
			       uint64_t zero_point)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x uint64_t elements */
	u64x2 = vld1q_u64(input);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__uint64_to_float32_neon_f32x1(const uint64_t *input, float *output, float scale,
			       uint64_t zero_point)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x uint64_t element */
	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t lane 0 */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const uint64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
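
/* Illustrative note, hypothetical example values: the conversion narrows through float64 to
 * float32 before the zero point is subtracted, so inputs above 2^24 already lose precision
 * at that step. For instance 16777217 (2^24 + 1) becomes 16777216.0f.
 */
#ifdef MLDEV_UTILS_NEON_SELFTEST
static void
__example_uint64_to_float32(void)
{
	uint64_t in[2] = {16777216, 16777217};
	float out[2];

	/* both lanes come out as 16777216.0f */
	rte_ml_io_uint64_to_float32(in, out, 2, 1.0f, 0);
}
#endif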

static inline void
__float32_to_float16_neon_f16x4(const float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(const float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
{
	const float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
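
/* Illustrative sketch, hypothetical values: float16 has a much smaller range than float32
 * (largest finite value 65504), so vcvt_f16_f32 turns 1.0e5f into infinity under the
 * default round-to-nearest mode, while values such as 1.5f convert exactly.
 */
#ifdef MLDEV_UTILS_NEON_SELFTEST
static void
__example_float32_to_float16(void)
{
	float32_t in[4] = {1.5f, -1.5f, 65504.0f, 1.0e5f};
	float16_t out[4];

	/* last element becomes +inf in float16 */
	rte_ml_io_float32_to_float16(in, out, 4);
}
#endif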

static inline void
__float16_to_float32_neon_f32x4(const float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(const float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
{
	const float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
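
/* Illustrative sketch, hypothetical values: widening float16 to float32 is exact, since
 * every float16 value is representable in float32, so a float32 -> float16 -> float32
 * round trip only loses what the narrowing step discarded.
 */
#ifdef MLDEV_UTILS_NEON_SELFTEST
static void
__example_float16_to_float32(void)
{
	float32_t src[4] = {1.5f, 0.1f, -2.75f, 4096.0f};
	float16_t half[4];
	float32_t back[4];

	rte_ml_io_float32_to_float16(src, half, 4);
	rte_ml_io_float16_to_float32(half, back, 4);

	/* back[0], back[2] and back[3] equal src exactly; back[1] differs, since 0.1f is not
	 * representable in float16.
	 */
}
#endif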