/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
 * is based on Arm Neon intrinsics.
 */
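
/*
 * Usage sketch (illustrative; the buffer sizes and scale value below are
 * hypothetical). Each conversion routine returns 0 on success and -EINVAL
 * on a zero scale, a zero element count or a NULL buffer:
 *
 *	float src[16];
 *	int8_t dst[16];
 *	int ret;
 *
 *	ret = rte_ml_io_float32_to_int8(16.0f, 16, src, dst);
 */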

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}
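
/* Note: the two saturating narrows above (int32 -> int16 -> int8) compose to
 * the same clamp as a single saturation to [-128, 127].
 */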

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
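	/* vlen is 8 here: each vector iteration consumes two float32x4_t loads
	 * (8 floats) and stores one int8x8_t.
	 */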
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}
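
/* Note: every int8_t value is exactly representable in float32, so the
 * widening conversion above is lossless; only the multiply by scale can
 * round.
 */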

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}
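
/* Note: negative scaled inputs saturate to 0 at the float-to-unsigned
 * convert (FCVTAU), ahead of the narrowing steps.
 */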

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 (1 element) */
	vst1_lane_f16(output, f16x4, 0);
}
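
/* Note: vcvt_f16_f32 narrows under the rounding mode currently selected in
 * FPCR (round-to-nearest-even by default); values beyond float16 range
 * become infinities.
 */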

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}