/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */

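/* Illustrative usage sketch (not part of the library): quantize a small
 * float32 buffer to int8 and dequantize it back with the converters defined
 * below. The function name, buffer contents, scale and zero-point values are
 * hypothetical; each element is recovered only up to one quantization step.
 */
static inline void
__mldev_utils_neon_usage_sketch(void)
{
	float in[8] = {-1.0f, -0.5f, 0.0f, 0.125f, 0.25f, 0.5f, 0.75f, 1.0f};
	int8_t quantized[8];
	float restored[8];

	/* quantized[i] = round(in[i] / scale) + zero_point, saturated to int8 */
	rte_ml_io_float32_to_int8(in, quantized, 8, 1.0f / 64.0f, 0);

	/* restored[i] = scale * (quantized[i] - zero_point) */
	rte_ml_io_int8_to_float32(quantized, restored, 8, 1.0f / 64.0f, 0);
}
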
static inline void
__float32_to_int8_neon_s8x8(const float *input, int8_t *output, float scale, int8_t zero_point)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);
	s8x8 = vmax_s8(s8x8, vdup_n_s8(INT8_MIN + 1));

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(const float *input, int8_t *output, float scale, int8_t zero_point)
{
	float32x2_t f32x2;
	int32x2_t s32x2;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	s32x2 = vcvta_s32_f32(f32x2);
	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT8_MIN + 1));

	/* saturate narrow */
	s16 = vqmovns_s32(vget_lane_s32(s32x2, 0));

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
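
/* Scalar reference for rte_ml_io_float32_to_int8() (a sketch for clarity,
 * not used by the library; the helper name is hypothetical): round to
 * nearest with ties away from zero, as vcvtaq_s32_f32() does, then clamp to
 * [INT8_MIN + 1, INT8_MAX] to mirror the vmax_s8() step in the vector path.
 */
static inline int8_t
__float32_to_int8_scalar_ref(float x, float scale, int8_t zero_point)
{
	float v;

	v = x / scale + (float)zero_point;

	/* clamp in the float domain, matching the saturating narrows above */
	if (v < (float)(INT8_MIN + 1))
		return INT8_MIN + 1;
	if (v > (float)INT8_MAX)
		return INT8_MAX;

	/* round to nearest, ties away from zero */
	return (int8_t)((v >= 0.0f) ? (int32_t)(v + 0.5f) : (int32_t)(v - 0.5f));
}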

static inline void
__int8_to_float32_neon_f32x8(const int8_t *input, float *output, float scale, int8_t zero_point)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(const int8_t *input, float *output, float scale, int8_t zero_point)
{
	*output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
}

int
rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(const float *input, uint8_t *output, float scale, uint8_t zero_point)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(const float *input, uint8_t *output, float scale, uint8_t zero_point)
{
	float32x2_t f32x2;
	uint32x2_t u32x2;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	u32x2 = vcvta_u32_f32(f32x2);

	/* saturate narrow */
	u16 = vqmovns_u32(vget_lane_u32(u32x2, 0));

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(const uint8_t *input, float *output, float scale, uint8_t zero_point)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);
	u16x8 = vmovl_u8(u8x8);
	s16x8 = vreinterpretq_s16_u16(u16x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(const uint8_t *input, float *output, float scale, uint8_t zero_point)
{
	*output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
}

int
rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(const float *input, int16_t *output, float scale, int16_t zero_point)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);
	s16x4 = vmax_s16(s16x4, vdup_n_s16(INT16_MIN + 1));

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(const float *input, int16_t *output, float scale, int16_t zero_point)
{
	float32x2_t f32x2;
	int32x2_t s32x2;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	s32x2 = vcvta_s32_f32(f32x2);
	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT16_MIN + 1));

	/* saturate narrow */
	*output = vqmovns_s32(vget_lane_s32(s32x2, 0));
}

int
rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(const int16_t *input, float *output, float scale, int16_t zero_point)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(const int16_t *input, float *output, float scale, int16_t zero_point)
{
	*output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
}

int
rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(const float *input, uint16_t *output, float scale,
			       uint16_t zero_point)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(const float *input, uint16_t *output, float scale,
			       uint16_t zero_point)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32((*input) / scale + (float)zero_point);

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint16_t zero_point)
{
	const float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(const uint16_t *input, float *output, float scale,
			       uint16_t zero_point)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(const uint16_t *input, float *output, float scale,
			       uint16_t zero_point)
{
	*output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
}

int
rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint16_t zero_point)
{
	const uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int32_neon_s32x4(const float *input, int32_t *output, float scale, int32_t zero_point)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* avoid the saturated INT32_MIN value; zero_point is already applied above */
	s32x4 = vmaxq_s32(s32x4, vdupq_n_s32(INT32_MIN + 1));

	/* store 4 elements */
	vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(const float *input, int32_t *output, float scale, int32_t zero_point)
{
	float32x2_t f32x2;
	int32x2_t s32x2;

	/* scale and convert, round to nearest with ties away rounding mode */
	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
	s32x2 = vcvta_s32_f32(f32x2);
	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT32_MIN + 1));

	/* store lane 0 */
	vst1_lane_s32(output, s32x2, 0);
}

int
rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int32_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(const int32_t *input, float *output, float scale, int32_t zero_point)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(const int32_t *input, float *output, float scale, int32_t zero_point)
{
	*output = scale * (vcvts_f32_s32(*input) - (float)zero_point);
}

int
rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int32_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(const float *input, uint32_t *output, float scale,
			       uint32_t zero_point)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));

	/* add zero point */
	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(const float *input, uint32_t *output, float scale,
			       uint32_t zero_point)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32((*input) / scale + (float)zero_point);
}

int
rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint32_t zero_point)
{
	const float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint32_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(const uint32_t *input, float *output, float scale,
			       uint32_t zero_point)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* subtract zero point */
	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(const uint32_t *input, float *output, float scale,
			       uint32_t zero_point)
{
	*output = scale * (vcvts_f32_u32(*input) - (float)zero_point);
}

int
rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint32_t zero_point)
{
	const uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint32_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int64_neon_s64x2(const float *input, int64_t *output, float scale, int64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;
	int64_t s64;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* avoid the saturated INT64_MIN value and store both lanes */
	s64 = vgetq_lane_s64(s64x2, 0);
	output[0] = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
	s64 = vgetq_lane_s64(s64x2, 1);
	output[1] = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
95042f3dcd9SSrikanth Yalavarthi }

static inline void
__float32_to_int64_neon_s64x1(const float *input, int64_t *output, float scale, int64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;
	int64_t s64;

	/* load 1 x float element */
	f32x2 = vdup_n_f32(*input);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);
	s64 = vgetq_lane_s64(s64x2, 0);
	s64 = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;

	/* store lane 0 of int64x2_t */
	*output = s64;
}

int
rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const float *input_buffer;
	int64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (int64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int64_neon_s64x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int64_neon_s64x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
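
/* Minimal usage sketch (hypothetical buffers and parameters, for illustration
 * only):
 *
 *	float in[3] = {-1.5f, 0.25f, 7.0f};
 *	int64_t out[3];
 *	int ret = rte_ml_io_float32_to_int64(in, out, 3, 0.25f, 0);
 *	// ret == 0; out[i] == round(in[i] / 0.25f), i.e. {-6, 1, 28}
 */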

static inline void
__int64_to_float32_neon_f32x2(const int64_t *input, float *output, float scale, int64_t zero_point)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x int64_t elements */
	s64x2 = vld1q_s64(input);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}
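
/* Dequantization above computes x = (q - zero_point) * scale; e.g. with
 * scale = 0.25 and zero_point = 10, a quantized value of 15 is restored to
 * (15 - 10) * 0.25 = 1.25f (values illustrative).
 */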

static inline void
__int64_to_float32_neon_f32x1(const int64_t *input, float *output, float scale, int64_t zero_point)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x int64_t element to lane 0 */
	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t lane 0 */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const int64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
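
/* Minimal usage sketch (hypothetical values): converting an int64_t buffer
 * back to float32 reverses the quantization above up to rounding error:
 *
 *	int64_t q[2] = {15, -6};
 *	float x[2];
 *	rte_ml_io_int64_to_float32(q, x, 2, 0.25f, 0);
 *	// x[0] == 3.75f, x[1] == -1.5f
 */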

static inline void
__float32_to_uint64_neon_u64x2(const float *input, uint64_t *output, float scale,
			       uint64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store 2 elements */
	vst1q_u64(output, u64x2);
}

static inline void
__float32_to_uint64_neon_u64x1(const float *input, uint64_t *output, float scale,
			       uint64_t zero_point)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 1 x float element */
	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);

	/* scale */
	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));

	/* add zero point */
	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store lane 0 / 1 element */
	vst1q_lane_u64(output, u64x2, 0);
}

int
rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const float *input_buffer;
	uint64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float *)input;
	output_buffer = (uint64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint64_neon_u64x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint64_neon_u64x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
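
/* Minimal usage sketch (hypothetical values). Unlike the signed variants, no
 * post-conversion clamp is applied here; negative scaled inputs saturate to 0
 * through vcvtaq_u64_f64:
 *
 *	float in[2] = {0.0f, 2.5f};
 *	uint64_t out[2];
 *	rte_ml_io_float32_to_uint64(in, out, 2, 0.5f, 0);
 *	// out[0] == 0, out[1] == 5
 */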

static inline void
__uint64_to_float32_neon_f32x2(const uint64_t *input, float *output, float scale,
			       uint64_t zero_point)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x uint64_t elements */
	u64x2 = vld1q_u64(input);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__uint64_to_float32_neon_f32x1(const uint64_t *input, float *output, float scale,
			       uint64_t zero_point)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x uint64_t element to lane 0 */
	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* subtract zero_point */
	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t lane 0 */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const uint64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
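
/* Minimal usage sketch (hypothetical values):
 *
 *	uint64_t q[2] = {0, 5};
 *	float x[2];
 *	rte_ml_io_uint64_to_float32(q, x, 2, 0.5f, 0);
 *	// x[0] == 0.0f, x[1] == 2.5f
 */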

static inline void
__float32_to_float16_neon_f16x4(const float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(const float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
{
	const float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
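
/* Minimal usage sketch (hypothetical values); the float16 conversions take no
 * scale or zero point, elements are converted directly:
 *
 *	float32_t in[4] = {1.0f, 0.5f, -2.0f, 65504.0f};
 *	float16_t out[4];
 *	rte_ml_io_float32_to_float16(in, out, 4);
 *	// 65504.0f is the largest finite float16 and narrows exactly
 */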

static inline void
__float16_to_float32_neon_f32x4(const float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(const float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
{
	const float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
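
/* Minimal usage sketch (hypothetical values); every float16 value is exactly
 * representable in float32, so the widening is lossless:
 *
 *	float16_t in[4];  // e.g. filled by rte_ml_io_float32_to_float16()
 *	float32_t out[4];
 *	rte_ml_io_float16_to_float32(in, out, 4);
 */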