/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa. Implementation is based on Arm
 * Neon intrinsics.
 */
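
/* Usage sketch (illustrative only; the buffer sizes and scale values below are
 * hypothetical and not part of this file):
 *
 *	float f32[16];		// filled by the caller
 *	int8_t q8[16];
 *	float back[16];
 *
 *	// quantize: q8[i] = saturate(round(f32[i] * 25.0f))
 *	rte_ml_io_float32_to_int8(25.0f, 16, f32, q8);
 *
 *	// dequantize with the reciprocal scale: back[i] = q8[i] * (1.0f / 25.0f)
 *	rte_ml_io_int8_to_float32(1.0f / 25.0f, 16, q8, back);
 *
 * The conversion routines return 0 on success and -EINVAL on invalid arguments
 * (zero element count, NULL buffers, or a zero scale where a scale is taken).
 */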

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
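
/* Rounding and saturation behaviour of the conversion above (worked example,
 * values are illustrative): with scale = 2.0f an input of 63.3f scales to
 * 126.6f and rounds (ties away from zero) to 127, while 70.0f scales to 140.0f
 * and saturates to INT8_MAX (127); large negative values saturate to
 * INT8_MIN (-128) in the same way.
 */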

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

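/* The bfloat16 conversions below use the Arm BF16 Neon intrinsics and are
 * compiled only when the toolchain defines __ARM_FEATURE_BF16, i.e. when the
 * target architecture includes the bf16 extension (the exact build flags
 * depend on the compiler and -march settings in use).
 */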
#ifdef __ARM_FEATURE_BF16

static inline void
__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
{
	float32x4_t f32x4;
	bfloat16x4_t bf16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert float32x4_t to bfloat16x4_t */
	bf16x4 = vcvt_bf16_f32(f32x4);

	/* store bfloat16x4_t */
	vst1_bf16(output, bf16x4);
}

static inline void
__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
{
	float32x4_t f32x4;
	bfloat16x4_t bf16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to bfloat16_t */
	bf16x4 = vcvt_bf16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_bf16(output, bf16x4, 0);
}

int
rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	bfloat16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (bfloat16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
{
	bfloat16x4_t bf16x4;
	float32x4_t f32x4;

	/* load 4 x bfloat16_t elements */
	bf16x4 = vld1_bf16(input);

	/* convert bfloat16x4_t to float32x4_t */
	f32x4 = vcvt_f32_bf16(bf16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
{
	bfloat16x4_t bf16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	bf16x4 = vld1_dup_bf16(input);

	/* convert bfloat16_t to float32_t */
	f32x4 = vcvt_f32_bf16(bf16x4);

	/* store lane 0 / 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	bfloat16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (bfloat16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

#endif /* __ARM_FEATURE_BF16 */