/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */
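
/*
 * Worked example of the scale semantics used throughout this file (values
 * are illustrative): quantizing x = 0.503f with scale = 127.0f computes
 * round(127.0f * 0.503f) = round(63.881f) = 64, which is then saturated to
 * the range of the target type (here [-128, 127]). Dequantization applies
 * the reciprocal scale: 64 * (1.0f / 127.0f) ~= 0.504f.
 */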

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
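
/*
 * Illustrative usage sketch (not part of the upstream file): quantize a small
 * float buffer to int8_t and dequantize it back. The buffer contents, scale
 * value and function name below are hypothetical.
 */
static inline int
__example_int8_quantize_roundtrip(void)
{
	float in[8] = {-1.0f, -0.5f, 0.0f, 0.25f, 0.5f, 0.75f, 1.0f, 2.0f};
	int8_t q[8];
	float out[8];
	int ret;

	/* q[i] = saturate(round(127.0f * in[i])); the 2.0f entry clamps to 127 */
	ret = rte_ml_io_float32_to_int8(127.0f, 8, in, q);
	if (ret != 0)
		return ret;

	/* out[i] = q[i] * (1.0f / 127.0f) */
	return rte_ml_io_int8_to_float32(1.0f / 127.0f, 8, q, out);
}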

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
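
/*
 * Note: negative float inputs saturate to zero here (and in the other
 * float32-to-unsigned routines below), since vcvtaq_u32_f32 and
 * vcvtas_u32_f32 clamp at the bottom of the unsigned range.
 */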

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

603fc54766bSSrikanth Yalavarthi static inline void
60450513ae5SSrikanth Yalavarthi __float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
60550513ae5SSrikanth Yalavarthi {
60650513ae5SSrikanth Yalavarthi 	float32x4_t f32x4;
60750513ae5SSrikanth Yalavarthi 	int32x4_t s32x4;
60850513ae5SSrikanth Yalavarthi 
60950513ae5SSrikanth Yalavarthi 	/* load 4 x float elements */
61050513ae5SSrikanth Yalavarthi 	f32x4 = vld1q_f32(input);
61150513ae5SSrikanth Yalavarthi 
61250513ae5SSrikanth Yalavarthi 	/* scale */
61350513ae5SSrikanth Yalavarthi 	f32x4 = vmulq_n_f32(f32x4, scale);
61450513ae5SSrikanth Yalavarthi 
61550513ae5SSrikanth Yalavarthi 	/* convert to int32x4_t using round to nearest with ties away rounding mode */
61650513ae5SSrikanth Yalavarthi 	s32x4 = vcvtaq_s32_f32(f32x4);
61750513ae5SSrikanth Yalavarthi 
61850513ae5SSrikanth Yalavarthi 	/* store 4 elements */
61950513ae5SSrikanth Yalavarthi 	vst1q_s32(output, s32x4);
62050513ae5SSrikanth Yalavarthi }
62150513ae5SSrikanth Yalavarthi 
62250513ae5SSrikanth Yalavarthi static inline void
62350513ae5SSrikanth Yalavarthi __float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
62450513ae5SSrikanth Yalavarthi {
62550513ae5SSrikanth Yalavarthi 	/* scale and convert, round to nearest with ties away rounding mode */
62650513ae5SSrikanth Yalavarthi 	*output = vcvtas_s32_f32(scale * (*input));
62750513ae5SSrikanth Yalavarthi }
62850513ae5SSrikanth Yalavarthi 
int
rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int32_t *)output;
	/* the vector helper converts 4 elements per call */
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
{
	*output = scale * vcvts_f32_s32(*input);
}

int
rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int32_t *)input;
	output_buffer = (float *)output;
	/* the vector helper converts 4 elements per call */
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint32_t *)output;
	/* the vector helper converts 4 elements per call */
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
{
	*output = scale * vcvts_f32_u32(*input);
}

int
rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint32_t *)input;
	output_buffer = (float *)output;
	/* the vector helper converts 4 elements per call */
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

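/*
 * The 64-bit conversions below widen through float64x2_t before the integer
 * conversion, so each vector helper handles two elements per call (vlen is
 * 4 * sizeof(float) / sizeof(int64_t) = 2).
 */
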
static inline void
__float32_to_int64_neon_s64x2(float scale, float *input, int64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* store 2 elements */
	vst1q_s64(output, s64x2);
}

static inline void
__float32_to_int64_neon_s64x1(float scale, float *input, int64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	int64x2_t s64x2;

	/* load 1 x float element */
	f32x2 = vdup_n_f32(*input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to int64x2_t */
	s64x2 = vcvtaq_s64_f64(f64x2);

	/* store lane 0 of int64x2_t */
	vst1q_lane_s64(output, s64x2, 0);
}

int
rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int64_neon_s64x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int64_neon_s64x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x int64_t elements */
	s64x2 = vld1q_s64(input);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__int64_to_float32_neon_f32x1(float scale, int64_t *input, float *output)
{
	int64x2_t s64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x int64_t element into lane 0 */
	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);

	/* convert int64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_s64(s64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store lane 0 of float32x2_t */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
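
/*
 * Note: the int64 paths convert via float64, so int64_t magnitudes above
 * 2^53 are already rounded before the final float32 rounding; large 64-bit
 * values therefore cannot round-trip exactly through these routines.
 */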

static inline void
__float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 2 x float elements */
	f32x2 = vld1_f32(input);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t, round to nearest with ties away rounding mode */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store 2 elements */
	vst1q_u64(output, u64x2);
}

static inline void
__float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
{
	float32x2_t f32x2;
	float64x2_t f64x2;
	uint64x2_t u64x2;

	/* load 1 x float element */
	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* convert to float64x2_t */
	f64x2 = vcvt_f64_f32(f32x2);

	/* convert to uint64x2_t, round to nearest with ties away rounding mode */
	u64x2 = vcvtaq_u64_f64(f64x2);

	/* store lane 0 / 1 element */
	vst1q_lane_u64(output, u64x2, 0);
}

int
rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint64_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint64_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint64_neon_u64x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint64_neon_u64x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
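
/*
 * Illustrative usage sketch (not part of the library): quantize float32
 * values with a hypothetical scale of 4. vcvtaq_u64_f64 rounds to nearest
 * with ties away from zero, so 0.125f * 4 = 0.5f becomes 1.
 *
 *	float f[3] = {0.125f, 1.0f, 2.75f};
 *	uint64_t q[3];
 *
 *	if (rte_ml_io_float32_to_uint64(4.0f, 3, f, q) == 0) {
 *		// q[0] == 1, q[1] == 4, q[2] == 11
 *	}
 */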

static inline void
__uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 2 x uint64_t elements */
	u64x2 = vld1q_u64(input);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store float32x2_t */
	vst1_f32(output, f32x2);
}

static inline void
__uint64_to_float32_neon_f32x1(float scale, uint64_t *input, float *output)
{
	uint64x2_t u64x2;
	float64x2_t f64x2;
	float32x2_t f32x2;

	/* load 1 x uint64_t element */
	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);

	/* convert uint64x2_t to float64x2_t */
	f64x2 = vcvtq_f64_u64(u64x2);

	/* convert float64x2_t to float32x2_t */
	f32x2 = vcvt_f32_f64(f64x2);

	/* scale */
	f32x2 = vmul_n_f32(f32x2, scale);

	/* store lane 0 / 1 element */
	vst1_lane_f32(output, f32x2, 0);
}

int
rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint64_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint64_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint64_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
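
/*
 * Illustrative usage sketch (not part of the library): dequantize uint64_t
 * values with a hypothetical scale of 0.5. Note that values above 2^24 may
 * lose precision in the final float64 -> float32 narrowing.
 *
 *	uint64_t q[3] = {0, 7, 1024};
 *	float f[3];
 *
 *	if (rte_ml_io_uint64_to_float32(0.5f, 3, q, f) == 0) {
 *		// f[0] == 0.0f, f[1] == 3.5f, f[2] == 512.0f
 *	}
 */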

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 / 1 element */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
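
/*
 * Illustrative usage sketch (not part of the library): narrow float32 data
 * to IEEE float16. This conversion takes no scale argument; magnitudes
 * above the float16 maximum of 65504 overflow to infinity under the
 * default round-to-nearest-even mode. Example data is hypothetical.
 *
 *	float32_t f32[5] = {0.0f, 1.0f, 0.5f, -2.0f, 65504.0f};
 *	float16_t f16[5];
 *
 *	if (rte_ml_io_float32_to_float16(5, f32, f16) == 0) {
 *		// all five values are exactly representable in float16
 *	}
 */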

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store lane 0 / 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
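
/*
 * Illustrative usage sketch (not part of the library): widen float16 back
 * to float32. Every float16 value is exactly representable as float32, so
 * a float32 -> float16 -> float32 round trip only loses what the first
 * narrowing dropped. Example data is hypothetical.
 *
 *	float16_t f16[4];
 *	float32_t f32[4];
 *	float32_t src[4] = {0.0f, 1.5f, -8.0f, 100.0f};
 *
 *	rte_ml_io_float32_to_float16(4, src, f16);
 *	if (rte_ml_io_float16_to_float32(4, f16, f32) == 0) {
 *		// f32 matches src; all four values are exact in float16
 *	}
 */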