/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
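
/*
 * Illustrative usage sketch only, not part of this file's API: a symmetric
 * int8 quantization round-trip built on rte_ml_io_float32_to_int8() and
 * rte_ml_io_int8_to_float32() (both declared in mldev_utils.h, so the
 * forward call below is valid). The helper name and the scale derivation
 * scale = 127 / absmax are assumptions chosen for the example; callers may
 * derive scale in any way that suits their model.
 */
static inline int
__example_int8_quantize_roundtrip(float *src, int8_t *q8, float *restored, uint64_t n)
{
	float absmax = 0.0f;
	float v;
	uint64_t i;
	int ret;

	/* find the largest magnitude to map onto the int8 range */
	for (i = 0; i < n; i++) {
		v = (src[i] < 0.0f) ? -src[i] : src[i];
		if (v > absmax)
			absmax = v;
	}

	if (absmax == 0.0f)
		return -EINVAL;

	/* quantize: q8[i] = round(src[i] * scale), saturated to [-128, 127] */
	ret = rte_ml_io_float32_to_int8(127.0f / absmax, n, src, q8);
	if (ret != 0)
		return ret;

	/* dequantize with the reciprocal scale */
	return rte_ml_io_int8_to_float32(absmax / 127.0f, n, q8, restored);
}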

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
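
/*
 * Behaviour sketch with hypothetical demo values, not part of this file's
 * API: the unsigned path clamps on both sides. vcvtaq_u32_f32 and
 * vcvtas_u32_f32 saturate negative results to 0, and the vqmovn narrowing
 * steps saturate anything above UINT8_MAX to 255.
 */
static inline void
__example_uint8_saturation(void)
{
	float in[4] = {-5.0f, 0.4f, 0.6f, 300.0f};
	uint8_t out[4];

	rte_ml_io_float32_to_uint8(1.0f, 4, in, out);

	/* out now holds {0, 0, 1, 255}: the negative input clamps to 0,
	 * 0.4 rounds down, 0.6 rounds up and 300 saturates to UINT8_MAX.
	 */
}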

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
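
/*
 * Rounding sketch with hypothetical demo values, not part of this file's
 * API: vcvtaq_s32_f32/vcvtas_s32_f32 round to nearest with ties away from
 * zero, unlike a plain C cast, which truncates toward zero.
 */
static inline void
__example_int16_rounding(void)
{
	float in[4] = {2.5f, -2.5f, 1.4f, -1.4f};
	int16_t out[4];

	rte_ml_io_float32_to_int16(1.0f, 4, in, out);

	/* out now holds {3, -3, 1, -1}; truncating casts would give
	 * {2, -2, 1, -1} instead.
	 */
}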

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* store 4 elements */
	vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_s32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int32_t *)output;
	/* the s32x4 kernel consumes 4 floats per iteration */
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
{
	*output = scale * vcvts_f32_s32(*input);
}

int
rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int32_t *)input;
	output_buffer = (float *)output;
	/* the f32x4 kernel consumes 4 int32_t elements per iteration */
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint32_t *)output;
	/* the u32x4 kernel consumes 4 floats per iteration */
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
{
	*output = scale * vcvts_f32_u32(*input);
}

int
rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint32_t *)input;
	output_buffer = (float *)output;
	/* the f32x4 kernel consumes 4 uint32_t elements per iteration */
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int64_neon_s64x2(float scale, float *input, int64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t using round to nearest with ties away rounding mode */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* store 2 elements */
	vst1q_s64(output, s64x2);
}

static inline void
__float32_to_int64_neon_s64x1(float scale, float *input, int64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;

	/* load 1 x float element, duplicated to both lanes */
	f32x2 = vdup_n_f32(*input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t using round to nearest with ties away rounding mode */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* store lane 0 of int64x2_t */
	vst1q_lane_s64(output, s64x2, 0);
}

int
rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int64_neon_s64x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int64_neon_s64x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x int64_t elements */
	s64x2 = vld1q_s64(input);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__int64_to_float32_neon_f32x1(float scale, int64_t *input, float *output)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x int64_t element into lane 0 */
	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store lane 0 of float32x2_t */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
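
/*
 * Precision sketch with hypothetical demo values, not part of this file's
 * API: the 64-bit paths widen through float64 only because Neon lacks a
 * direct float32 <-> 64-bit integer conversion; the value still passes
 * through float32, whose 24-bit significand cannot represent every int64.
 */
static inline void
__example_int64_precision(void)
{
	int64_t in[2] = {16777216, 16777217}; /* 2^24 and 2^24 + 1 */
	float f32[2];
	int64_t out[2];

	rte_ml_io_int64_to_float32(1.0f, 2, in, f32);
	rte_ml_io_float32_to_int64(1.0f, 2, f32, out);

	/* out[0] is 16777216 exactly; out[1] is also 16777216 because
	 * 16777217 is not representable as a float32.
	 */
}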

static inline void
__float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t using round to nearest with ties away rounding mode */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store 2 elements */
	vst1q_u64(output, u64x2);
}

static inline void
__float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 1 x float element into lane 0 */
	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t using round to nearest with ties away rounding mode */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store lane 0 of uint64x2_t */
	vst1q_lane_u64(output, u64x2, 0);
}

int
rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint64_neon_u64x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint64_neon_u64x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x uint64_t elements */
	u64x2 = vld1q_u64(input);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__uint64_to_float32_neon_f32x1(float scale, uint64_t *input, float *output)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x uint64_t element into lane 0 */
	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store lane 0 / 1 element */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
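/* Precision note for the uint64_t to float32 path above: the value is
 * widened to float64 and then narrowed, so integers above 2**24 may not be
 * exactly representable in the float32 result. Illustrative sketch:
 *
 *	uint64_t in[2] = {16777216, 16777217};
 *	float out[2];
 *	rte_ml_io_uint64_to_float32(1.0f, 2, in, out);
 *
 * Both outputs compare equal to 16777216.0f, since 16777217 (2**24 + 1)
 * has no exact float32 representation.
 */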
static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
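/* Conversion note: vcvt_f16_f32 narrows under the current FPCR rounding
 * mode (round-to-nearest-even by default), and float32 magnitudes beyond
 * the float16 range become infinities. Illustrative sketch:
 *
 *	float32_t in[4] = {1.0f, 0.333333f, 65504.0f, 70000.0f};
 *	float16_t out[4];
 *	rte_ml_io_float32_to_float16(4, in, out);
 *
 * out[2] stays 65504 (the largest finite float16) while out[3] overflows
 * to +infinity; out[1] is rounded to the nearest representable float16.
 */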
static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}
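/* Design note on the single-element helpers in this file: the tail paths
 * either duplicate the scalar across all lanes (vld1q_dup_f32,
 * vld1_dup_f16) or insert it into lane 0 of a zeroed vector, reuse the
 * same full-width conversion, and store only lane 0. The remaining lanes
 * are computed and discarded, which keeps the leftover loop simple at the
 * cost of a little redundant work.
 */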
int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
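/* Illustrative round-trip sketch (buffer names are examples): widening
 * float16 to float32 is exact, since every float16 value is representable
 * as a float32.
 *
 *	float32_t src[3] = {0.5f, -1.5f, 100.0f};
 *	float16_t half[3];
 *	float32_t dst[3];
 *	rte_ml_io_float32_to_float16(3, src, half);
 *	rte_ml_io_float16_to_float32(3, half, dst);
 *
 * Here dst[] matches src[] exactly because these values are all
 * representable in float16; inputs that are not would round in the
 * narrowing step.
 */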