/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */
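
/* Usage sketch (illustrative only, not part of the library): quantize a small
 * float32 buffer to int8 with a made-up scale, then dequantize it back with
 * the reciprocal scale. The guard macro is hypothetical and only keeps the
 * example out of regular builds.
 */
#ifdef MLDEV_UTILS_NEON_USAGE_EXAMPLE
static int
mldev_utils_neon_example(void)
{
	float f32[8] = {-1.5f, -0.5f, 0.0f, 0.25f, 0.5f, 0.75f, 1.0f, 1.5f};
	int8_t q8[8];
	float back[8];
	int ret;

	/* q8[i] = saturate_int8(round(64.0f * f32[i])) */
	ret = rte_ml_io_float32_to_int8(64.0f, 8, f32, q8);
	if (ret != 0)
		return ret;

	/* back[i] = (1.0f / 64.0f) * q8[i] */
	return rte_ml_io_int8_to_float32(1.0f / 64.0f, 8, q8, back);
}
#endif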

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}
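
/* Quantize nb_elements float32 values to int8: each output is
 * round-to-nearest-ties-away of (scale * input), saturated to [-128, 127].
 * For example, with scale = 64.0f, 0.7f becomes round(44.8) = 45 and 3.0f
 * saturates to 127. Returns 0, or -EINVAL on invalid arguments.
 */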
int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}
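
/* Dequantize nb_elements int8 values to float32: each output is
 * scale * (float)input; the int8-to-float conversion itself is exact.
 * Returns 0, or -EINVAL on invalid arguments.
 */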
int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}
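
/* Quantize nb_elements float32 values to uint8: each output is
 * round-to-nearest-ties-away of (scale * input), saturated to [0, 255];
 * negative products saturate to 0 in the float-to-unsigned conversion.
 * Returns 0, or -EINVAL on invalid arguments.
 */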
int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}
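
/* Dequantize nb_elements uint8 values to float32: each output is
 * scale * (float)input. Returns 0, or -EINVAL on invalid arguments.
 */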
int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}
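
/* Quantize nb_elements float32 values to int16: each output is
 * round-to-nearest-ties-away of (scale * input), saturated to
 * [-32768, 32767]. Four elements are converted per vector iteration.
 * Returns 0, or -EINVAL on invalid arguments.
 */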
int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}
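
/* Dequantize nb_elements int16 values to float32: each output is
 * scale * (float)input. Returns 0, or -EINVAL on invalid arguments.
 */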
int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}
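
/* Quantize nb_elements float32 values to uint16: each output is
 * round-to-nearest-ties-away of (scale * input), saturated to [0, 65535];
 * negative products saturate to 0. Returns 0, or -EINVAL on invalid
 * arguments.
 */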
int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}
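
/* Dequantize nb_elements uint16 values to float32: each output is
 * scale * (float)input; the uint16-to-float conversion is exact.
 * Returns 0, or -EINVAL on invalid arguments.
 */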
int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}
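
/* Convert nb_elements float32 values to float16 (IEEE 754 binary16); no
 * scale is applied. Returns 0, or -EINVAL on invalid arguments.
 */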
int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}
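
/* Convert nb_elements float16 values to float32; the widening conversion is
 * exact, and no scale is applied. Returns 0, or -EINVAL on invalid
 * arguments.
 */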
int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}