1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2022 Marvell.
3  */
4 
5 #include <errno.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 
9 #include "mldev_utils.h"
10 
11 #include <arm_neon.h>
12 
13 /* Description:
14  * This file implements vector versions of Machine Learning utility functions used to convert data
15  * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
16  * is based on Arm Neon intrinsics.
17  */
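
/* Usage sketch (illustrative only): the buffer names, sizes and the scale /
 * zero_point values below are hypothetical; only the rte_ml_io_* functions
 * are defined in this file.
 *
 *	float src[16];		(filled by the caller)
 *	int8_t q[16];
 *	float dst[16];
 *
 *	rte_ml_io_float32_to_int8(src, q, 16, 0.05f, 0);	(quantize)
 *	rte_ml_io_int8_to_float32(q, dst, 16, 0.05f, 0);	(dequantize)
 */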
18 
19 static inline void
20 __float32_to_int8_neon_s8x8(const float *input, int8_t *output, float scale, int8_t zero_point)
21 {
22 	int16x4_t s16x4_l;
23 	int16x4_t s16x4_h;
24 	float32x4_t f32x4;
25 	int16x8_t s16x8;
26 	int32x4_t s32x4;
27 	int8x8_t s8x8;
28 
29 	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
30 	 * Use round to nearest with ties away rounding mode.
31 	 */
32 	f32x4 = vld1q_f32(input);
33 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
34 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
35 	s32x4 = vcvtaq_s32_f32(f32x4);
36 	s16x4_l = vqmovn_s32(s32x4);
37 
38 	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
39 	 * Use round to nearest with ties away rounding mode.
40 	 */
41 	f32x4 = vld1q_f32(input + 4);
42 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
43 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
44 	s32x4 = vcvtaq_s32_f32(f32x4);
45 	s16x4_h = vqmovn_s32(s32x4);
46 
47 	/* combine lower and higher int16x4_t to int16x8_t */
48 	s16x8 = vcombine_s16(s16x4_l, s16x4_h);
49 
50 	/* narrow to int8_t */
51 	s8x8 = vqmovn_s16(s16x8);
52 	s8x8 = vmax_s8(s8x8, vdup_n_s8(INT8_MIN + 1));
53 
54 	/* store 8 elements */
55 	vst1_s8(output, s8x8);
56 }
57 
58 static inline void
59 __float32_to_int8_neon_s8x1(const float *input, int8_t *output, float scale, int8_t zero_point)
60 {
61 	float32x2_t f32x2;
62 	int32x2_t s32x2;
63 	int16_t s16;
64 
65 	/* scale and convert, round to nearest with ties away rounding mode */
66 	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
67 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
68 	s32x2 = vcvta_s32_f32(f32x2);
69 	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT8_MIN + 1));
70 
71 	/* saturate narrow */
72 	s16 = vqmovns_s32(vget_lane_s32(s32x2, 0));
73 
74 	/* convert to int8_t */
75 	*output = vqmovnh_s16(s16);
76 }
77 
78 int
79 rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
80 			  int8_t zero_point)
81 {
82 	const float *input_buffer;
83 	int8_t *output_buffer;
84 	uint64_t nb_iterations;
85 	uint32_t vlen;
86 	uint64_t i;
87 
88 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
89 		return -EINVAL;
90 
91 	input_buffer = (const float *)input;
92 	output_buffer = (int8_t *)output;
93 	vlen = 2 * sizeof(float) / sizeof(int8_t);
94 	nb_iterations = nb_elements / vlen;
95 
96 	/* convert vlen elements in each iteration */
97 	for (i = 0; i < nb_iterations; i++) {
98 		__float32_to_int8_neon_s8x8(input_buffer, output_buffer, scale, zero_point);
99 		input_buffer += vlen;
100 		output_buffer += vlen;
101 	}
102 
103 	/* convert leftover elements */
104 	i = i * vlen;
105 	for (; i < nb_elements; i++) {
106 		__float32_to_int8_neon_s8x1(input_buffer, output_buffer, scale, zero_point);
107 		input_buffer++;
108 		output_buffer++;
109 	}
110 
111 	return 0;
112 }
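
/* For reference, a scalar sketch of what the vector path above computes per
 * element (the helper name is illustrative and not part of the library;
 * roundf() is from <math.h>):
 *
 *	static int8_t
 *	quantize_f32_to_s8(float x, float scale, int8_t zero_point)
 *	{
 *		float v = roundf(x / scale + (float)zero_point);
 *
 *		if (v < (float)(INT8_MIN + 1))
 *			return INT8_MIN + 1;
 *		if (v > (float)INT8_MAX)
 *			return INT8_MAX;
 *		return (int8_t)v;
 *	}
 *
 * roundf() rounds halfway cases away from zero, matching vcvtaq_s32_f32, and
 * the clamps mirror the vqmovn_s32/vqmovn_s16 saturation plus the vmax_s8
 * lower bound of INT8_MIN + 1.
 */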
113 
114 static inline void
115 __int8_to_float32_neon_f32x8(const int8_t *input, float *output, float scale, int8_t zero_point)
116 {
117 	float32x4_t f32x4;
118 	int16x8_t s16x8;
119 	int16x4_t s16x4;
120 	int32x4_t s32x4;
121 	int8x8_t s8x8;
122 
123 	/* load 8 x int8_t elements */
124 	s8x8 = vld1_s8(input);
125 
126 	/* widen int8_t to int16_t */
127 	s16x8 = vmovl_s8(s8x8);
128 
129 	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
130 	s16x4 = vget_low_s16(s16x8);
131 	s32x4 = vmovl_s16(s16x4);
132 	f32x4 = vcvtq_f32_s32(s32x4);
133 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
134 	f32x4 = vmulq_n_f32(f32x4, scale);
135 	vst1q_f32(output, f32x4);
136 
137 	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
138 	s16x4 = vget_high_s16(s16x8);
139 	s32x4 = vmovl_s16(s16x4);
140 	f32x4 = vcvtq_f32_s32(s32x4);
141 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
142 	f32x4 = vmulq_n_f32(f32x4, scale);
143 	vst1q_f32(output + 4, f32x4);
144 }
145 
146 static inline void
147 __int8_to_float32_neon_f32x1(const int8_t *input, float *output, float scale, int8_t zero_point)
148 {
149 	*output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
150 }
151 
152 int
153 rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
154 			  int8_t zero_point)
155 {
156 	const int8_t *input_buffer;
157 	float *output_buffer;
158 	uint64_t nb_iterations;
159 	uint32_t vlen;
160 	uint64_t i;
161 
162 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
163 		return -EINVAL;
164 
165 	input_buffer = (const int8_t *)input;
166 	output_buffer = (float *)output;
167 	vlen = 2 * sizeof(float) / sizeof(int8_t);
168 	nb_iterations = nb_elements / vlen;
169 
170 	/* convert vlen elements in each iteration */
171 	for (i = 0; i < nb_iterations; i++) {
172 		__int8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
173 		input_buffer += vlen;
174 		output_buffer += vlen;
175 	}
176 
177 	/* convert leftover elements */
178 	i = i * vlen;
179 	for (; i < nb_elements; i++) {
180 		__int8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
181 		input_buffer++;
182 		output_buffer++;
183 	}
184 
185 	return 0;
186 }
187 
188 static inline void
189 __float32_to_uint8_neon_u8x8(const float *input, uint8_t *output, float scale, uint8_t zero_point)
190 {
191 	uint16x4_t u16x4_l;
192 	uint16x4_t u16x4_h;
193 	float32x4_t f32x4;
194 	uint32x4_t u32x4;
195 	uint16x8_t u16x8;
196 	uint8x8_t u8x8;
197 
198 	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
199 	 * use round to nearest with ties away rounding mode.
200 	 */
201 	f32x4 = vld1q_f32(input);
202 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
203 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
204 	u32x4 = vcvtaq_u32_f32(f32x4);
205 	u16x4_l = vqmovn_u32(u32x4);
206 
207 	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t
208 	 * use round to nearest with ties away rounding mode.
209 	 */
210 	f32x4 = vld1q_f32(input + 4);
211 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
212 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
213 	u32x4 = vcvtaq_u32_f32(f32x4);
214 	u16x4_h = vqmovn_u32(u32x4);
215 
216 	/* combine lower and higher uint16x4_t */
217 	u16x8 = vcombine_u16(u16x4_l, u16x4_h);
218 
219 	/* narrow to uint8x8_t */
220 	u8x8 = vqmovn_u16(u16x8);
221 
222 	/* store 8 elements */
223 	vst1_u8(output, u8x8);
224 }
225 
226 static inline void
227 __float32_to_uint8_neon_u8x1(const float *input, uint8_t *output, float scale, uint8_t zero_point)
228 {
229 	float32x2_t f32x2;
230 	uint32x2_t u32x2;
231 	uint16_t u16;
232 
233 	/* scale and convert, round to nearest with ties away rounding mode */
234 	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
235 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
236 	u32x2 = vcvta_u32_f32(f32x2);
237 
238 	/* saturate narrow */
239 	u16 = vqmovns_u32(vget_lane_u32(u32x2, 0));
240 
241 	/* convert to uint8_t */
242 	*output = vqmovnh_u16(u16);
243 }
244 
245 int
246 rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
247 			   uint8_t zero_point)
248 {
249 	const float *input_buffer;
250 	uint8_t *output_buffer;
251 	uint64_t nb_iterations;
252 	uint32_t vlen;
253 	uint64_t i;
254 
255 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
256 		return -EINVAL;
257 
258 	input_buffer = (const float *)input;
259 	output_buffer = (uint8_t *)output;
260 	vlen = 2 * sizeof(float) / sizeof(uint8_t);
261 	nb_iterations = nb_elements / vlen;
262 
263 	/* convert vlen elements in each iteration */
264 	for (i = 0; i < nb_iterations; i++) {
265 		__float32_to_uint8_neon_u8x8(input_buffer, output_buffer, scale, zero_point);
266 		input_buffer += vlen;
267 		output_buffer += vlen;
268 	}
269 
270 	/* convert leftover elements */
271 	i = i * vlen;
272 	for (; i < nb_elements; i++) {
273 		__float32_to_uint8_neon_u8x1(input_buffer, output_buffer, scale, zero_point);
274 		input_buffer++;
275 		output_buffer++;
276 	}
277 
278 	return 0;
279 }
280 
281 static inline void
282 __uint8_to_float32_neon_f32x8(const uint8_t *input, float *output, float scale, uint8_t zero_point)
283 {
284 	float32x4_t f32x4;
285 	uint16x8_t u16x8;
286 	int16x8_t s16x8;
287 	int16x4_t s16x4;
288 	int32x4_t s32x4;
289 	uint8x8_t u8x8;
290 
291 	/* load 8 x uint8_t elements */
292 	u8x8 = vld1_u8(input);
293 	u16x8 = vmovl_u8(u8x8);
294 	s16x8 = vreinterpretq_s16_u16(u16x8);
295 
296 	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
297 	s16x4 = vget_low_s16(s16x8);
298 	s32x4 = vmovl_s16(s16x4);
299 	f32x4 = vcvtq_f32_s32(s32x4);
300 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
301 	f32x4 = vmulq_n_f32(f32x4, scale);
302 	vst1q_f32(output, f32x4);
303 
304 	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
305 	s16x4 = vget_high_s16(s16x8);
306 	s32x4 = vmovl_s16(s16x4);
307 	f32x4 = vcvtq_f32_s32(s32x4);
308 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
309 	f32x4 = vmulq_n_f32(f32x4, scale);
310 	vst1q_f32(output + 4, f32x4);
311 }
312 
313 static inline void
314 __uint8_to_float32_neon_f32x1(const uint8_t *input, float *output, float scale, uint8_t zero_point)
315 {
316 	*output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
317 }
318 
319 int
320 rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
321 			   uint8_t zero_point)
322 {
323 	const uint8_t *input_buffer;
324 	float *output_buffer;
325 	uint64_t nb_iterations;
326 	uint64_t vlen;
327 	uint64_t i;
328 
329 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
330 		return -EINVAL;
331 
332 	input_buffer = (const uint8_t *)input;
333 	output_buffer = (float *)output;
334 	vlen = 2 * sizeof(float) / sizeof(uint8_t);
335 	nb_iterations = nb_elements / vlen;
336 
337 	/* convert vlen elements in each iteration */
338 	for (i = 0; i < nb_iterations; i++) {
339 		__uint8_to_float32_neon_f32x8(input_buffer, output_buffer, scale, zero_point);
340 		input_buffer += vlen;
341 		output_buffer += vlen;
342 	}
343 
344 	/* convert leftover elements */
345 	i = i * vlen;
346 	for (; i < nb_elements; i++) {
347 		__uint8_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
348 		input_buffer++;
349 		output_buffer++;
350 	}
351 
352 	return 0;
353 }
354 
355 static inline void
356 __float32_to_int16_neon_s16x4(const float *input, int16_t *output, float scale, int16_t zero_point)
357 {
358 	float32x4_t f32x4;
359 	int16x4_t s16x4;
360 	int32x4_t s32x4;
361 
362 	/* load 4 x float elements */
363 	f32x4 = vld1q_f32(input);
364 
365 	/* scale */
366 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
367 
368 	/* add zero point */
369 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
370 
371 	/* convert to int32x4_t using round to nearest with ties away rounding mode */
372 	s32x4 = vcvtaq_s32_f32(f32x4);
373 
374 	/* saturate narrow to int16x4_t */
375 	s16x4 = vqmovn_s32(s32x4);
376 	s16x4 = vmax_s16(s16x4, vdup_n_s16(INT16_MIN + 1));
377 
378 	/* store 4 elements */
379 	vst1_s16(output, s16x4);
380 }
381 
382 static inline void
383 __float32_to_int16_neon_s16x1(const float *input, int16_t *output, float scale, int16_t zero_point)
384 {
385 	float32x2_t f32x2;
386 	int32x2_t s32x2;
387 
388 	/* scale and convert, round to nearest with ties away rounding mode */
389 	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
390 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
391 	s32x2 = vcvta_s32_f32(f32x2);
392 	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT16_MIN + 1));
393 
394 	/* saturate narrow */
395 	*output = vqmovns_s32(vget_lane_s32(s32x2, 0));
396 }
397 
398 int
399 rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
400 			   int16_t zero_point)
401 {
402 	const float *input_buffer;
403 	int16_t *output_buffer;
404 	uint64_t nb_iterations;
405 	uint32_t vlen;
406 	uint64_t i;
407 
408 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
409 		return -EINVAL;
410 
411 	input_buffer = (const float *)input;
412 	output_buffer = (int16_t *)output;
413 	vlen = 2 * sizeof(float) / sizeof(int16_t);
414 	nb_iterations = nb_elements / vlen;
415 
416 	/* convert vlen elements in each iteration */
417 	for (i = 0; i < nb_iterations; i++) {
418 		__float32_to_int16_neon_s16x4(input_buffer, output_buffer, scale, zero_point);
419 		input_buffer += vlen;
420 		output_buffer += vlen;
421 	}
422 
423 	/* convert leftover elements */
424 	i = i * vlen;
425 	for (; i < nb_elements; i++) {
426 		__float32_to_int16_neon_s16x1(input_buffer, output_buffer, scale, zero_point);
427 		input_buffer++;
428 		output_buffer++;
429 	}
430 
431 	return 0;
432 }
433 
434 static inline void
435 __int16_to_float32_neon_f32x4(const int16_t *input, float *output, float scale, int16_t zero_point)
436 {
437 	float32x4_t f32x4;
438 	int16x4_t s16x4;
439 	int32x4_t s32x4;
440 
441 	/* load 4 x int16_t elements */
442 	s16x4 = vld1_s16(input);
443 
444 	/* widen int16_t to int32_t */
445 	s32x4 = vmovl_s16(s16x4);
446 
447 	/* convert int32_t to float */
448 	f32x4 = vcvtq_f32_s32(s32x4);
449 
450 	/* subtract zero point */
451 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
452 
453 	/* scale */
454 	f32x4 = vmulq_n_f32(f32x4, scale);
455 
456 	/* store float32x4_t */
457 	vst1q_f32(output, f32x4);
458 }
459 
460 static inline void
461 __int16_to_float32_neon_f32x1(const int16_t *input, float *output, float scale, int16_t zero_point)
462 {
463 	*output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
464 }
465 
466 int
467 rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
468 			   int16_t zero_point)
469 {
470 	const int16_t *input_buffer;
471 	float *output_buffer;
472 	uint64_t nb_iterations;
473 	uint32_t vlen;
474 	uint64_t i;
475 
476 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
477 		return -EINVAL;
478 
479 	input_buffer = (const int16_t *)input;
480 	output_buffer = (float *)output;
481 	vlen = 2 * sizeof(float) / sizeof(int16_t);
482 	nb_iterations = nb_elements / vlen;
483 
484 	/* convert vlen elements in each iteration */
485 	for (i = 0; i < nb_iterations; i++) {
486 		__int16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
487 		input_buffer += vlen;
488 		output_buffer += vlen;
489 	}
490 
491 	/* convert leftover elements */
492 	i = i * vlen;
493 	for (; i < nb_elements; i++) {
494 		__int16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
495 		input_buffer++;
496 		output_buffer++;
497 	}
498 
499 	return 0;
500 }
501 
502 static inline void
503 __float32_to_uint16_neon_u16x4(const float *input, uint16_t *output, float scale,
504 			       uint16_t zero_point)
505 {
506 	float32x4_t f32x4;
507 	uint16x4_t u16x4;
508 	uint32x4_t u32x4;
509 
510 	/* load 4 float elements */
511 	f32x4 = vld1q_f32(input);
512 
513 	/* scale */
514 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
515 
516 	/* add zero point */
517 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
518 
519 	/* convert using round to nearest with ties to away rounding mode */
520 	u32x4 = vcvtaq_u32_f32(f32x4);
521 
522 	/* saturate narrow */
523 	u16x4 = vqmovn_u32(u32x4);
524 
525 	/* store 4 elements */
526 	vst1_u16(output, u16x4);
527 }
528 
529 static inline void
530 __float32_to_uint16_neon_u16x1(const float *input, uint16_t *output, float scale,
531 			       uint16_t zero_point)
532 {
533 	uint32_t u32;
534 
535 	/* scale and convert, round to nearest with ties away rounding mode */
536 	u32 = vcvtas_u32_f32((*input) / scale + (float)zero_point);
537 
538 	/* saturate narrow */
539 	*output = vqmovns_u32(u32);
540 }
541 
542 int
543 rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
544 			   uint16_t zero_point)
545 {
546 	const float *input_buffer;
547 	uint16_t *output_buffer;
548 	uint64_t nb_iterations;
549 	uint64_t vlen;
550 	uint64_t i;
551 
552 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
553 		return -EINVAL;
554 
555 	input_buffer = (const float *)input;
556 	output_buffer = (uint16_t *)output;
557 	vlen = 2 * sizeof(float) / sizeof(uint16_t);
558 	nb_iterations = nb_elements / vlen;
559 
560 	/* convert vlen elements in each iteration */
561 	for (i = 0; i < nb_iterations; i++) {
562 		__float32_to_uint16_neon_u16x4(input_buffer, output_buffer, scale, zero_point);
563 		input_buffer += vlen;
564 		output_buffer += vlen;
565 	}
566 
567 	/* convert leftover elements */
568 	i = i * vlen;
569 	for (; i < nb_elements; i++) {
570 		__float32_to_uint16_neon_u16x1(input_buffer, output_buffer, scale, zero_point);
571 		input_buffer++;
572 		output_buffer++;
573 	}
574 
575 	return 0;
576 }
577 
578 static inline void
579 __uint16_to_float32_neon_f32x4(const uint16_t *input, float *output, float scale,
580 			       uint16_t zero_point)
581 {
582 	float32x4_t f32x4;
583 	uint16x4_t u16x4;
584 	uint32x4_t u32x4;
585 
586 	/* load 4 x uint16_t elements */
587 	u16x4 = vld1_u16(input);
588 
589 	/* widen uint16_t to uint32_t */
590 	u32x4 = vmovl_u16(u16x4);
591 
592 	/* convert uint32_t to float */
593 	f32x4 = vcvtq_f32_u32(u32x4);
594 
595 	/* subtract zero point */
596 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
597 
598 	/* scale */
599 	f32x4 = vmulq_n_f32(f32x4, scale);
600 
601 	/* store float32x4_t */
602 	vst1q_f32(output, f32x4);
603 }
604 
605 static inline void
606 __uint16_to_float32_neon_f32x1(const uint16_t *input, float *output, float scale,
607 			       uint16_t zero_point)
608 {
609 	*output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
610 }
611 
612 int
613 rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
614 			   uint16_t zero_point)
615 {
616 	const uint16_t *input_buffer;
617 	float *output_buffer;
618 	uint64_t nb_iterations;
619 	uint32_t vlen;
620 	uint64_t i;
621 
622 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
623 		return -EINVAL;
624 
625 	input_buffer = (const uint16_t *)input;
626 	output_buffer = (float *)output;
627 	vlen = 2 * sizeof(float) / sizeof(uint16_t);
628 	nb_iterations = nb_elements / vlen;
629 
630 	/* convert vlen elements in each iteration */
631 	for (i = 0; i < nb_iterations; i++) {
632 		__uint16_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
633 		input_buffer += vlen;
634 		output_buffer += vlen;
635 	}
636 
637 	/* convert leftover elements */
638 	i = i * vlen;
639 	for (; i < nb_elements; i++) {
640 		__uint16_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
641 		input_buffer++;
642 		output_buffer++;
643 	}
644 
645 	return 0;
646 }
647 
648 static inline void
649 __float32_to_int32_neon_s32x4(const float *input, int32_t *output, float scale, int32_t zero_point)
650 {
651 	float32x4_t f32x4;
652 	int32x4_t s32x4;
653 
654 	/* load 4 x float elements */
655 	f32x4 = vld1q_f32(input);
656 
657 	/* scale */
658 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
659 
660 	/* add zero point */
661 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
662 
663 	/* convert to int32x4_t using round to nearest with ties away rounding mode */
664 	s32x4 = vcvtaq_s32_f32(f32x4);
665 	s32x4 = vmaxq_s32(s32x4, vdupq_n_s32(INT32_MIN + 1));
669 
670 	/* store 4 elements */
671 	vst1q_s32(output, s32x4);
672 }
673 
674 static inline void
675 __float32_to_int32_neon_s32x1(const float *input, int32_t *output, float scale, int32_t zero_point)
676 {
677 	float32x2_t f32x2;
678 	int32x2_t s32x2;
679 
680 	/* scale and convert, round to nearest with ties away rounding mode */
681 	f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
682 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
683 	s32x2 = vcvta_s32_f32(f32x2);
684 	s32x2 = vmax_s32(s32x2, vdup_n_s32(INT32_MIN + 1));
685 
686 	/* store lane 0 */
687 	vst1_lane_s32(output, s32x2, 0);
688 }
689 
690 int
691 rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
692 			   int32_t zero_point)
693 {
694 	const float *input_buffer;
695 	int32_t *output_buffer;
696 	uint64_t nb_iterations;
697 	uint32_t vlen;
698 	uint64_t i;
699 
700 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
701 		return -EINVAL;
702 
703 	input_buffer = (const float *)input;
704 	output_buffer = (int32_t *)output;
705 	vlen = 4 * sizeof(float) / sizeof(int32_t);
706 	nb_iterations = nb_elements / vlen;
707 
708 	/* convert vlen elements in each iteration */
709 	for (i = 0; i < nb_iterations; i++) {
710 		__float32_to_int32_neon_s32x4(input_buffer, output_buffer, scale, zero_point);
711 		input_buffer += vlen;
712 		output_buffer += vlen;
713 	}
714 
715 	/* convert leftover elements */
716 	i = i * vlen;
717 	for (; i < nb_elements; i++) {
718 		__float32_to_int32_neon_s32x1(input_buffer, output_buffer, scale, zero_point);
719 		input_buffer++;
720 		output_buffer++;
721 	}
722 
723 	return 0;
724 }
725 
726 static inline void
727 __int32_to_float32_neon_f32x4(const int32_t *input, float *output, float scale, int32_t zero_point)
728 {
729 	float32x4_t f32x4;
730 	int32x4_t s32x4;
731 
732 	/* load 4 x int32_t elements */
733 	s32x4 = vld1q_s32(input);
734 
735 	/* convert int32_t to float */
736 	f32x4 = vcvtq_f32_s32(s32x4);
737 
738 	/* subtract zero point */
739 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
740 
741 	/* scale */
742 	f32x4 = vmulq_n_f32(f32x4, scale);
743 
744 	/* store float32x4_t */
745 	vst1q_f32(output, f32x4);
746 }
747 
748 static inline void
749 __int32_to_float32_neon_f32x1(const int32_t *input, float *output, float scale, int32_t zero_point)
750 {
751 	*output = scale * (vcvts_f32_s32(*input) - (float)zero_point);
752 }
753 
754 int
755 rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
756 			   int32_t zero_point)
757 {
758 	const int32_t *input_buffer;
759 	float *output_buffer;
760 	uint64_t nb_iterations;
761 	uint32_t vlen;
762 	uint64_t i;
763 
764 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
765 		return -EINVAL;
766 
767 	input_buffer = (const int32_t *)input;
768 	output_buffer = (float *)output;
769 	vlen = 4 * sizeof(float) / sizeof(int32_t);
770 	nb_iterations = nb_elements / vlen;
771 
772 	/* convert vlen elements in each iteration */
773 	for (i = 0; i < nb_iterations; i++) {
774 		__int32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
775 		input_buffer += vlen;
776 		output_buffer += vlen;
777 	}
778 
779 	/* convert leftover elements */
780 	i = i * vlen;
781 	for (; i < nb_elements; i++) {
782 		__int32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
783 		input_buffer++;
784 		output_buffer++;
785 	}
786 
787 	return 0;
788 }
789 
790 static inline void
791 __float32_to_uint32_neon_u32x4(const float *input, uint32_t *output, float scale,
792 			       uint32_t zero_point)
793 {
794 	float32x4_t f32x4;
795 	uint32x4_t u32x4;
796 
797 	/* load 4 float elements */
798 	f32x4 = vld1q_f32(input);
799 
800 	/* scale */
801 	f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
802 
803 	/* add zero point */
804 	f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
805 
806 	/* convert using round to nearest with ties to away rounding mode */
807 	u32x4 = vcvtaq_u32_f32(f32x4);
808 
809 	/* store 4 elements */
810 	vst1q_u32(output, u32x4);
811 }
812 
813 static inline void
814 __float32_to_uint32_neon_u32x1(const float *input, uint32_t *output, float scale,
815 			       uint32_t zero_point)
816 {
817 	/* scale and convert, round to nearest with ties away rounding mode */
818 	*output = vcvtas_u32_f32((*input) / scale + (float)zero_point);
819 }
820 
821 int
822 rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
823 			   uint32_t zero_point)
824 {
825 	const float *input_buffer;
826 	uint32_t *output_buffer;
827 	uint64_t nb_iterations;
828 	uint64_t vlen;
829 	uint64_t i;
830 
831 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
832 		return -EINVAL;
833 
834 	input_buffer = (const float *)input;
835 	output_buffer = (uint32_t *)output;
836 	vlen = 4 * sizeof(float) / sizeof(uint32_t);
837 	nb_iterations = nb_elements / vlen;
838 
839 	/* convert vlen elements in each iteration */
840 	for (i = 0; i < nb_iterations; i++) {
841 		__float32_to_uint32_neon_u32x4(input_buffer, output_buffer, scale, zero_point);
842 		input_buffer += vlen;
843 		output_buffer += vlen;
844 	}
845 
846 	/* convert leftover elements */
847 	i = i * vlen;
848 	for (; i < nb_elements; i++) {
849 		__float32_to_uint32_neon_u32x1(input_buffer, output_buffer, scale, zero_point);
850 		input_buffer++;
851 		output_buffer++;
852 	}
853 
854 	return 0;
855 }
856 
857 static inline void
858 __uint32_to_float32_neon_f32x4(const uint32_t *input, float *output, float scale,
859 			       uint32_t zero_point)
860 {
861 	float32x4_t f32x4;
862 	uint32x4_t u32x4;
863 
864 	/* load 4 x uint32_t elements */
865 	u32x4 = vld1q_u32(input);
866 
867 	/* convert uint32_t to float */
868 	f32x4 = vcvtq_f32_u32(u32x4);
869 
870 	/* subtract zero point */
871 	f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
872 
873 	/* scale */
874 	f32x4 = vmulq_n_f32(f32x4, scale);
875 
876 	/* store float32x4_t */
877 	vst1q_f32(output, f32x4);
878 }
879 
880 static inline void
881 __uint32_to_float32_neon_f32x1(const uint32_t *input, float *output, float scale,
882 			       uint32_t zero_point)
883 {
884 	*output = scale * (vcvts_f32_u32(*input) - (float)zero_point);
885 }
886 
887 int
888 rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
889 			   uint32_t zero_point)
890 {
891 	const uint32_t *input_buffer;
892 	float *output_buffer;
893 	uint64_t nb_iterations;
894 	uint32_t vlen;
895 	uint64_t i;
896 
897 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
898 		return -EINVAL;
899 
900 	input_buffer = (const uint32_t *)input;
901 	output_buffer = (float *)output;
902 	vlen = 4 * sizeof(float) / sizeof(uint32_t);
903 	nb_iterations = nb_elements / vlen;
904 
905 	/* convert vlen elements in each iteration */
906 	for (i = 0; i < nb_iterations; i++) {
907 		__uint32_to_float32_neon_f32x4(input_buffer, output_buffer, scale, zero_point);
908 		input_buffer += vlen;
909 		output_buffer += vlen;
910 	}
911 
912 	/* convert leftover elements */
913 	i = i * vlen;
914 	for (; i < nb_elements; i++) {
915 		__uint32_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
916 		input_buffer++;
917 		output_buffer++;
918 	}
919 
920 	return 0;
921 }
922 
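/* The 64-bit conversions below have no direct float32 <-> 64-bit integer NEON
 * conversion, so each pair of elements is first widened to float64x2_t with
 * vcvt_f64_f32 and then converted with the 64-bit variants (vcvtaq_s64_f64,
 * vcvtaq_u64_f64); the reverse direction narrows back through float64x2_t.
 */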
923 static inline void
924 __float32_to_int64_neon_s64x2(const float *input, int64_t *output, float scale, int64_t zero_point)
925 {
926 	float32x2_t f32x2;
927 	float64x2_t f64x2;
928 	int64x2_t s64x2;
929 	int64_t s64;
930 
931 	/* load 2 x float elements */
932 	f32x2 = vld1_f32(input);
933 
934 	/* scale */
935 	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
936 
937 	/* add zero point */
938 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
939 
940 	/* convert to float64x2_t */
941 	f64x2 = vcvt_f64_f32(f32x2);
942 
943 	/* convert to int64x2_t */
944 	s64x2 = vcvtaq_s64_f64(f64x2);
945 
946 	/* clamp INT64_MIN to INT64_MIN + 1 and store both lanes */
947 	s64 = vgetq_lane_s64(s64x2, 0);
948 	output[0] = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
949 	s64 = vgetq_lane_s64(s64x2, 1);
950 	output[1] = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
950 }
951 
952 static inline void
953 __float32_to_int64_neon_s64x1(const float *input, int64_t *output, float scale, int64_t zero_point)
954 {
955 	float32x2_t f32x2;
956 	float64x2_t f64x2;
957 	int64x2_t s64x2;
958 	int64_t s64;
959 
960 	/* load 1 x float element */
961 	f32x2 = vdup_n_f32(*input);
962 
963 	/* scale */
964 	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
965 
966 	/* add zero point */
967 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
968 
969 	/* convert to float64x2_t */
970 	f64x2 = vcvt_f64_f32(f32x2);
971 
972 	/* convert to int64x2_t */
973 	s64x2 = vcvtaq_s64_f64(f64x2);
974 	s64 = vgetq_lane_s64(s64x2, 0);
975 	s64 = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
976 
977 	/* store lane 0 of int64x2_t */
978 	*output = s64;
979 }
980 
981 int
982 rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
983 			   int64_t zero_point)
984 {
985 	const float *input_buffer;
986 	int64_t *output_buffer;
987 	uint64_t nb_iterations;
988 	uint32_t vlen;
989 	uint64_t i;
990 
991 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
992 		return -EINVAL;
993 
994 	input_buffer = (const float *)input;
995 	output_buffer = (int64_t *)output;
996 	vlen = 4 * sizeof(float) / sizeof(int64_t);
997 	nb_iterations = nb_elements / vlen;
998 
999 	/* convert vlen elements in each iteration */
1000 	for (i = 0; i < nb_iterations; i++) {
1001 		__float32_to_int64_neon_s64x2(input_buffer, output_buffer, scale, zero_point);
1002 		input_buffer += vlen;
1003 		output_buffer += vlen;
1004 	}
1005 
1006 	/* convert leftover elements */
1007 	i = i * vlen;
1008 	for (; i < nb_elements; i++) {
1009 		__float32_to_int64_neon_s64x1(input_buffer, output_buffer, scale, zero_point);
1010 		input_buffer++;
1011 		output_buffer++;
1012 	}
1013 
1014 	return 0;
1015 }
1016 
1017 static inline void
1018 __int64_to_float32_neon_f32x2(const int64_t *input, float *output, float scale, int64_t zero_point)
1019 {
1020 	int64x2_t s64x2;
1021 	float64x2_t f64x2;
1022 	float32x2_t f32x2;
1023 
1024 	/* load 2 x int64_t elements */
1025 	s64x2 = vld1q_s64(input);
1026 
1027 	/* convert int64x2_t to float64x2_t */
1028 	f64x2 = vcvtq_f64_s64(s64x2);
1029 
1030 	/* convert float64x2_t to float32x2_t */
1031 	f32x2 = vcvt_f32_f64(f64x2);
1032 
1033 	/* subtract zero_point */
1034 	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));
1035 
1036 	/* scale */
1037 	f32x2 = vmul_n_f32(f32x2, scale);
1038 
1039 	/* store float32x2_t */
1040 	vst1_f32(output, f32x2);
1041 }
1042 
1043 static inline void
1044 __int64_to_float32_neon_f32x1(const int64_t *input, float *output, float scale, int64_t zero_point)
1045 {
1046 	int64x2_t s64x2;
1047 	float64x2_t f64x2;
1048 	float32x2_t f32x2;
1049 
1050 	/* load 1 x int64_t element into lane 0 */
1051 	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);
1052 
1053 	/* convert int64x2_t to float64x2_t */
1054 	f64x2 = vcvtq_f64_s64(s64x2);
1055 
1056 	/* convert float64x2_t to float32x2_t */
1057 	f32x2 = vcvt_f32_f64(f64x2);
1058 
1059 	/* subtract zero_point */
1060 	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));
1061 
1062 	/* scale */
1063 	f32x2 = vmul_n_f32(f32x2, scale);
1064 
1065 	/* store float32x2_t lane 0 */
1066 	vst1_lane_f32(output, f32x2, 0);
1067 }
1068 
1069 int
1070 rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
1071 			   int64_t zero_point)
1072 {
1073 	const int64_t *input_buffer;
1074 	float *output_buffer;
1075 	uint64_t nb_iterations;
1076 	uint32_t vlen;
1077 	uint64_t i;
1078 
1079 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
1080 		return -EINVAL;
1081 
1082 	input_buffer = (const int64_t *)input;
1083 	output_buffer = (float *)output;
1084 	vlen = 4 * sizeof(float) / sizeof(int64_t);
1085 	nb_iterations = nb_elements / vlen;
1086 
1087 	/* convert vlen elements in each iteration */
1088 	for (i = 0; i < nb_iterations; i++) {
1089 		__int64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
1090 		input_buffer += vlen;
1091 		output_buffer += vlen;
1092 	}
1093 
1094 	/* convert leftover elements */
1095 	i = i * vlen;
1096 	for (; i < nb_elements; i++) {
1097 		__int64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
1098 		input_buffer++;
1099 		output_buffer++;
1100 	}
1101 
1102 	return 0;
1103 }
1104 
1105 static inline void
1106 __float32_to_uint64_neon_u64x2(const float *input, uint64_t *output, float scale,
1107 			       uint64_t zero_point)
1108 {
1109 	float32x2_t f32x2;
1110 	float64x2_t f64x2;
1111 	uint64x2_t u64x2;
1112 
1113 	/* load 2 x float elements */
1114 	f32x2 = vld1_f32(input);
1115 
1116 	/* scale */
1117 	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
1118 
1119 	/* add zero point */
1120 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
1121 
1122 	/* convert to float64x2_t */
1123 	f64x2 = vcvt_f64_f32(f32x2);
1124 
1125 	/* convert to uint64x2_t */
1126 	u64x2 = vcvtaq_u64_f64(f64x2);
1127 
1128 	/* store 2 elements */
1129 	vst1q_u64(output, u64x2);
1130 }
1131 
1132 static inline void
1133 __float32_to_uint64_neon_u64x1(const float *input, uint64_t *output, float scale,
1134 			       uint64_t zero_point)
1135 {
1136 	float32x2_t f32x2;
1137 	float64x2_t f64x2;
1138 	uint64x2_t u64x2;
1139 
1140 	/* load 1 x float element */
1141 	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);
1142 
1143 	/* scale */
1144 	f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
1145 
1146 	/* add zero_point */
1147 	f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
1148 
1149 	/* convert to float64x2_t */
1150 	f64x2 = vcvt_f64_f32(f32x2);
1151 
1152 	/* convert to uint64x2_t */
1153 	u64x2 = vcvtaq_u64_f64(f64x2);
1154 
1155 	/* store lane 0 */
1156 	vst1q_lane_u64(output, u64x2, 0);
1157 }
1158 
1159 int
1160 rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
1161 			   uint64_t zero_point)
1162 {
1163 	const float *input_buffer;
1164 	uint64_t *output_buffer;
1165 	uint64_t nb_iterations;
1166 	uint32_t vlen;
1167 	uint64_t i;
1168 
1169 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
1170 		return -EINVAL;
1171 
1172 	input_buffer = (const float *)input;
1173 	output_buffer = (uint64_t *)output;
1174 	vlen = 4 * sizeof(float) / sizeof(uint64_t);
1175 	nb_iterations = nb_elements / vlen;
1176 
1177 	/* convert vlen elements in each iteration */
1178 	for (i = 0; i < nb_iterations; i++) {
1179 		__float32_to_uint64_neon_u64x2(input_buffer, output_buffer, scale, zero_point);
1180 		input_buffer += vlen;
1181 		output_buffer += vlen;
1182 	}
1183 
1184 	/* convert leftover elements */
1185 	i = i * vlen;
1186 	for (; i < nb_elements; i++) {
1187 		__float32_to_uint64_neon_u64x1(input_buffer, output_buffer, scale, zero_point);
1188 		input_buffer++;
1189 		output_buffer++;
1190 	}
1191 
1192 	return 0;
1193 }
1194 
1195 static inline void
1196 __uint64_to_float32_neon_f32x2(const uint64_t *input, float *output, float scale,
1197 			       uint64_t zero_point)
1198 {
1199 	uint64x2_t u64x2;
1200 	float64x2_t f64x2;
1201 	float32x2_t f32x2;
1202 
1203 	/* load 2 x uint64_t elements */
1204 	u64x2 = vld1q_u64(input);
1205 
1206 	/* convert uint64x2_t to float64x2_t */
1207 	f64x2 = vcvtq_f64_u64(u64x2);
1208 
1209 	/* convert float64x2_t to float32x2_t */
1210 	f32x2 = vcvt_f32_f64(f64x2);
1211 
1212 	/* subtract zero_point */
1213 	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));
1214 
1215 	/* scale */
1216 	f32x2 = vmul_n_f32(f32x2, scale);
1217 
1218 	/* store float32x2_t */
1219 	vst1_f32(output, f32x2);
1220 }
1221 
1222 static inline void
1223 __uint64_to_float32_neon_f32x1(const uint64_t *input, float *output, float scale,
1224 			       uint64_t zero_point)
1225 {
1226 	uint64x2_t u64x2;
1227 	float64x2_t f64x2;
1228 	float32x2_t f32x2;
1229 
1230 	/* load 1 x uint64_t element into lane 0 */
1231 	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);
1232 
1233 	/* convert uint64x2_t to float64x2_t */
1234 	f64x2 = vcvtq_f64_u64(u64x2);
1235 
1236 	/* convert float64x2_t to float32x2_t */
1237 	f32x2 = vcvt_f32_f64(f64x2);
1238 
1239 	/* subtract zero_point */
1240 	f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));
1241 
1242 	/* scale */
1243 	f32x2 = vmul_n_f32(f32x2, scale);
1244 
1245 	/* store float32x2_t lane 0 */
1246 	vst1_lane_f32(output, f32x2, 0);
1247 }
1248 
1249 int
1250 rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
1251 			   uint64_t zero_point)
1252 {
1253 	const uint64_t *input_buffer;
1254 	float *output_buffer;
1255 	uint64_t nb_iterations;
1256 	uint32_t vlen;
1257 	uint64_t i;
1258 
1259 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
1260 		return -EINVAL;
1261 
1262 	input_buffer = (const uint64_t *)input;
1263 	output_buffer = (float *)output;
1264 	vlen = 4 * sizeof(float) / sizeof(uint64_t);
1265 	nb_iterations = nb_elements / vlen;
1266 
1267 	/* convert vlen elements in each iteration */
1268 	for (i = 0; i < nb_iterations; i++) {
1269 		__uint64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
1270 		input_buffer += vlen;
1271 		output_buffer += vlen;
1272 	}
1273 
1274 	/* convert leftover elements */
1275 	i = i * vlen;
1276 	for (; i < nb_elements; i++) {
1277 		__uint64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
1278 		input_buffer++;
1279 		output_buffer++;
1280 	}
1281 
1282 	return 0;
1283 }
1284 
1285 static inline void
1286 __float32_to_float16_neon_f16x4(const float32_t *input, float16_t *output)
1287 {
1288 	float32x4_t f32x4;
1289 	float16x4_t f16x4;
1290 
1291 	/* load 4 x float32_t elements */
1292 	f32x4 = vld1q_f32(input);
1293 
1294 	/* convert to float16x4_t */
1295 	f16x4 = vcvt_f16_f32(f32x4);
1296 
1297 	/* store float16x4_t */
1298 	vst1_f16(output, f16x4);
1299 }
1300 
1301 static inline void
1302 __float32_to_float16_neon_f16x1(const float32_t *input, float16_t *output)
1303 {
1304 	float32x4_t f32x4;
1305 	float16x4_t f16x4;
1306 
1307 	/* load element to 4 lanes */
1308 	f32x4 = vld1q_dup_f32(input);
1309 
1310 	/* convert float32_t to float16_t */
1311 	f16x4 = vcvt_f16_f32(f32x4);
1312 
1313 	/* store lane 0 / 1 element */
1314 	vst1_lane_f16(output, f16x4, 0);
1315 }
1316 
1317 int
1318 rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
1319 {
1320 	const float32_t *input_buffer;
1321 	float16_t *output_buffer;
1322 	uint64_t nb_iterations;
1323 	uint32_t vlen;
1324 	uint64_t i;
1325 
1326 	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
1327 		return -EINVAL;
1328 
1329 	input_buffer = (const float32_t *)input;
1330 	output_buffer = (float16_t *)output;
1331 	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
1332 	nb_iterations = nb_elements / vlen;
1333 
1334 	/* convert vlen elements in each iteration */
1335 	for (i = 0; i < nb_iterations; i++) {
1336 		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
1337 		input_buffer += vlen;
1338 		output_buffer += vlen;
1339 	}
1340 
1341 	/* convert leftover elements */
1342 	i = i * vlen;
1343 	for (; i < nb_elements; i++) {
1344 		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
1345 		input_buffer++;
1346 		output_buffer++;
1347 	}
1348 
1349 	return 0;
1350 }
1351 
1352 static inline void
1353 __float16_to_float32_neon_f32x4(const float16_t *input, float32_t *output)
1354 {
1355 	float16x4_t f16x4;
1356 	float32x4_t f32x4;
1357 
1358 	/* load 4 x float16_t elements */
1359 	f16x4 = vld1_f16(input);
1360 
1361 	/* convert float16x4_t to float32x4_t */
1362 	f32x4 = vcvt_f32_f16(f16x4);
1363 
1364 	/* store float32x4_t */
1365 	vst1q_f32(output, f32x4);
1366 }
1367 
1368 static inline void
1369 __float16_to_float32_neon_f32x1(const float16_t *input, float32_t *output)
1370 {
1371 	float16x4_t f16x4;
1372 	float32x4_t f32x4;
1373 
1374 	/* load element to 4 lanes */
1375 	f16x4 = vld1_dup_f16(input);
1376 
1377 	/* convert float16_t to float32_t */
1378 	f32x4 = vcvt_f32_f16(f16x4);
1379 
1380 	/* store 1 element */
1381 	vst1q_lane_f32(output, f32x4, 0);
1382 }
1383 
1384 int
1385 rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
1386 {
1387 	const float16_t *input_buffer;
1388 	float32_t *output_buffer;
1389 	uint64_t nb_iterations;
1390 	uint32_t vlen;
1391 	uint64_t i;
1392 
1393 	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
1394 		return -EINVAL;
1395 
1396 	input_buffer = (const float16_t *)input;
1397 	output_buffer = (float32_t *)output;
1398 	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
1399 	nb_iterations = nb_elements / vlen;
1400 
1401 	/* convert vlen elements in each iteration */
1402 	for (i = 0; i < nb_iterations; i++) {
1403 		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
1404 		input_buffer += vlen;
1405 		output_buffer += vlen;
1406 	}
1407 
1408 	/* convert leftover elements */
1409 	i = i * vlen;
1410 	for (; i < nb_elements; i++) {
1411 		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
1412 		input_buffer++;
1413 		output_buffer++;
1414 	}
1415 
1416 	return 0;
1417 }
1418