/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */
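/* Illustrative usage (a sketch, not part of this file): quantize a float32
 * buffer to int8 with a caller-chosen scale, then dequantize with the
 * reciprocal scale. The buffer size (16) and the scale value (16.0f) are
 * assumptions for the example only.
 *
 *	float in[16] = { ... };
 *	int8_t q[16];
 *	float out[16];
 *
 *	rte_ml_io_float32_to_int8(16.0f, 16, in, q);
 *	rte_ml_io_int8_to_float32(1.0f / 16.0f, 16, q, out);
 */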
static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
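	/* Each vector iteration consumes two float32x4_t loads (8 floats) and
	 * produces one int8x8_t store, hence vlen = 2 * 4 = 8. Elements beyond
	 * the last full vector are handled by the scalar tail loop below.
	 */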
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
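/* Unsigned variants follow. vcvtaq_u32_f32/vcvtas_u32_f32 saturate negative
 * inputs to zero, so scaled values below 0 quantize to 0 rather than wrapping.
 */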
static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
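/* 16-bit variants: each iteration handles 4 elements (one float32x4_t maps to
 * one int16x4_t), so a single saturating narrow suffices, unlike the 8-bit
 * paths above which narrow twice.
 */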
static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}
static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);
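/* 32-bit variants follow below: no narrowing step is needed; out-of-range
 * scaled values saturate in the float-to-integer conversion itself. Note that
 * float32 carries only 24 significand bits, so integers above 2^24 may not be
 * represented exactly after conversion to float.
 */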
	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* store 4 elements */
	vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_s32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int32_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
{
	*output = scale * vcvts_f32_s32(*input);
}

int
rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int32_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32(scale * (*input));
}
int
rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint32_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
{
	*output = scale * vcvts_f32_u32(*input);
}

int
rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
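/* The float16 conversions further below take no scale argument; they are pure
 * precision changes. vcvt_f16_f32 rounds according to the current FPCR
 * rounding mode, typically round-to-nearest-even by default.
 */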
	uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint32_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}