/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. The
 * implementation is based on Arm Neon intrinsics.
 */
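
/* Usage sketch (illustrative only, not part of the library): quantize a
 * float32 buffer to int8 and dequantize it back using the rte_ml_io_*
 * routines defined below. The guard macro, buffer size and scale value
 * are hypothetical.
 */
#ifdef MLDEV_UTILS_NEON_USAGE_EXAMPLE
static int
mldev_utils_neon_usage_example(void)
{
	float in[10] = {-1.0f, -0.5f, -0.1f, 0.0f, 0.1f, 0.25f, 0.5f, 0.75f, 0.9f, 1.0f};
	float restored[10];
	int8_t quantized[10];
	int ret;

	/* map |x| <= 1.0 onto the int8 range with scale = 127 */
	ret = rte_ml_io_float32_to_int8(127.0f, 10, in, quantized);
	if (ret != 0)
		return ret;

	/* dequantize with the reciprocal scale; restored[i] is close to in[i] */
	return rte_ml_io_int8_to_float32(1.0f / 127.0f, 10, quantized, restored);
}
#endif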

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* saturate narrow to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}
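
/* Scalar reference sketch (illustrative only): the ISO C equivalent of
 * the Neon path above, for readers without the intrinsics. roundf()
 * rounds to nearest with ties away from zero, matching vcvtas_s32_f32,
 * and the clamp mirrors the saturating narrows (NaN handling aside).
 * The guard macro and helper name are hypothetical.
 */
#ifdef MLDEV_UTILS_NEON_USAGE_EXAMPLE
#include <math.h>

static void
float32_to_int8_scalar_ref(float scale, const float *input, int8_t *output, uint64_t n)
{
	uint64_t i;
	float v;

	for (i = 0; i < n; i++) {
		v = roundf(scale * input[i]);

		/* saturate to the int8_t range, as the vqmovn steps do */
		if (v < -128.0f)
			v = -128.0f;
		else if (v > 127.0f)
			v = 127.0f;

		output[i] = (int8_t)v;
	}
}
#endif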

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* saturate narrow to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode.
	 * The conversion saturates to the int32_t range, so no separate narrow step is needed.
	 */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* store 4 elements */
	vst1q_s32(output, s32x4);
}

static inline void
__float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_s32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int32_t *)output;
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int32_neon_s32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int32_neon_s32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
{
	float32x4_t f32x4;
	int32x4_t s32x4;

	/* load 4 x int32_t elements */
	s32x4 = vld1q_s32(input);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
{
	*output = scale * vcvts_f32_s32(*input);
}

int
rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int32_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(int32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode.
	 * The conversion saturates to the uint32_t range, so no separate narrow step is needed.
	 */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* store 4 elements */
	vst1q_u32(output, u32x4);
}

static inline void
__float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
{
	/* scale and convert, round to nearest with ties away rounding mode */
	*output = vcvtas_u32_f32(scale * (*input));
}

int
rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint32_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint32_t *)output;
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint32_neon_u32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint32_neon_u32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
{
	float32x4_t f32x4;
	uint32x4_t u32x4;

	/* load 4 x uint32_t elements */
	u32x4 = vld1q_u32(input);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
{
	*output = scale * vcvts_f32_u32(*input);
}

int
rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint32_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint32_t *)input;
	output_buffer = (float *)output;
	vlen = 4 * sizeof(float) / sizeof(uint32_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 (1 element) */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

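/* Precision note (illustrative sketch, not part of the library): float16
 * keeps an 11-bit significand, so a float32 -> float16 -> float32 round
 * trip is lossy. The guard macro and values below are hypothetical.
 */
#ifdef MLDEV_UTILS_NEON_USAGE_EXAMPLE
static float
float16_round_trip_error(void)
{
	float in = 0.1f; /* not exactly representable in float16 */
	float16_t half;
	float out;

	rte_ml_io_float32_to_float16(1, &in, &half);
	rte_ml_io_float16_to_float32(1, &half, &out);

	/* difference is on the order of 1e-5 for values near 0.1 */
	return out - in;
}
#endif
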
static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}