xref: /dpdk/lib/mldev/mldev_utils_neon.c (revision c6552d9a8deffa448de2d5e2e726f50508c1efd2)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2022 Marvell.
3  */
4 
5 #include <errno.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 
9 #include "mldev_utils.h"
10 
11 #include <arm_neon.h>
12 
13 /* Description:
14  * This file implements vector versions of the Machine Learning utility functions used to convert
15  * data types from higher precision to lower precision and vice versa, except for bfloat16. The
16  * implementation is based on Arm Neon intrinsics.
17  */
18 
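/* Usage sketch (illustrative): quantizing a small float32 buffer to int8_t
 * with rte_ml_io_float32_to_int8() below. The buffer size and the scale value
 * of 127.0f are arbitrary assumptions chosen for the example; a non-zero
 * return value indicates invalid arguments (-EINVAL).
 *
 *	float src[16];
 *	int8_t dst[16];
 *	uint64_t i;
 *	int ret;
 *
 *	for (i = 0; i < 16; i++)
 *		src[i] = (float)i / 16.0f;
 *
 *	ret = rte_ml_io_float32_to_int8(127.0f, 16, src, dst);
 *	if (ret != 0)
 *		return ret;
 */
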
19 static inline void
20 __float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
21 {
22 	int16x4_t s16x4_l;
23 	int16x4_t s16x4_h;
24 	float32x4_t f32x4;
25 	int16x8_t s16x8;
26 	int32x4_t s32x4;
27 	int8x8_t s8x8;
28 
29 	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
30 	 * Use round to nearest with ties away rounding mode.
31 	 */
32 	f32x4 = vld1q_f32(input);
33 	f32x4 = vmulq_n_f32(f32x4, scale);
34 	s32x4 = vcvtaq_s32_f32(f32x4);
35 	s16x4_l = vqmovn_s32(s32x4);
36 
37 	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
38 	 * Use round to nearest with ties away rounding mode.
39 	 */
40 	f32x4 = vld1q_f32(input + 4);
41 	f32x4 = vmulq_n_f32(f32x4, scale);
42 	s32x4 = vcvtaq_s32_f32(f32x4);
43 	s16x4_h = vqmovn_s32(s32x4);
44 
45 	/* combine lower and higher int16x4_t to int16x8_t */
46 	s16x8 = vcombine_s16(s16x4_l, s16x4_h);
47 
48 	/* narrow to int8_t */
49 	s8x8 = vqmovn_s16(s16x8);
50 
51 	/* store 8 elements */
52 	vst1_s8(output, s8x8);
53 }
54 
55 static inline void
56 __float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
57 {
58 	int32_t s32;
59 	int16_t s16;
60 
61 	/* scale and convert, round to nearest with ties away rounding mode */
62 	s32 = vcvtas_s32_f32(scale * (*input));
63 
64 	/* saturate narrow */
65 	s16 = vqmovns_s32(s32);
66 
67 	/* saturate narrow to int8_t */
68 	*output = vqmovnh_s16(s16);
69 }
70 
71 int
72 rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
73 {
74 	float *input_buffer;
75 	int8_t *output_buffer;
76 	uint64_t nb_iterations;
77 	uint32_t vlen;
78 	uint64_t i;
79 
80 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
81 		return -EINVAL;
82 
83 	input_buffer = (float *)input;
84 	output_buffer = (int8_t *)output;
85 	vlen = 2 * sizeof(float) / sizeof(int8_t);
86 	nb_iterations = nb_elements / vlen;
87 
88 	/* convert vlen elements in each iteration */
89 	for (i = 0; i < nb_iterations; i++) {
90 		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
91 		input_buffer += vlen;
92 		output_buffer += vlen;
93 	}
94 
95 	/* convert leftover elements */
96 	i = i * vlen;
97 	for (; i < nb_elements; i++) {
98 		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
99 		input_buffer++;
100 		output_buffer++;
101 	}
102 
103 	return 0;
104 }
105 
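/* Note on the loop structure above: vlen evaluates to
 * 2 * sizeof(float) / sizeof(int8_t), i.e. 8 with a 4-byte float, matching the
 * 8 int8_t lanes written per call to __float32_to_int8_neon_s8x8(). For
 * example, with nb_elements = 20 the vector loop runs 20 / 8 = 2 iterations
 * (16 elements) and the scalar tail loop converts the remaining 4 elements one
 * at a time. Every conversion routine in this file follows the same
 * vector-body-plus-scalar-tail pattern.
 */
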
106 static inline void
107 __int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
108 {
109 	float32x4_t f32x4;
110 	int16x8_t s16x8;
111 	int16x4_t s16x4;
112 	int32x4_t s32x4;
113 	int8x8_t s8x8;
114 
115 	/* load 8 x int8_t elements */
116 	s8x8 = vld1_s8(input);
117 
118 	/* widen int8_t to int16_t */
119 	s16x8 = vmovl_s8(s8x8);
120 
121 	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
122 	s16x4 = vget_low_s16(s16x8);
123 	s32x4 = vmovl_s16(s16x4);
124 	f32x4 = vcvtq_f32_s32(s32x4);
125 	f32x4 = vmulq_n_f32(f32x4, scale);
126 	vst1q_f32(output, f32x4);
127 
128 	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
129 	s16x4 = vget_high_s16(s16x8);
130 	s32x4 = vmovl_s16(s16x4);
131 	f32x4 = vcvtq_f32_s32(s32x4);
132 	f32x4 = vmulq_n_f32(f32x4, scale);
133 	vst1q_f32(output + 4, f32x4);
134 }
135 
136 static inline void
137 __int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
138 {
139 	*output = scale * vcvts_f32_s32((int32_t)*input);
140 }
141 
142 int
143 rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
144 {
145 	int8_t *input_buffer;
146 	float *output_buffer;
147 	uint64_t nb_iterations;
148 	uint32_t vlen;
149 	uint64_t i;
150 
151 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
152 		return -EINVAL;
153 
154 	input_buffer = (int8_t *)input;
155 	output_buffer = (float *)output;
156 	vlen = 2 * sizeof(float) / sizeof(int8_t);
157 	nb_iterations = nb_elements / vlen;
158 
159 	/* convert vlen elements in each iteration */
160 	for (i = 0; i < nb_iterations; i++) {
161 		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
162 		input_buffer += vlen;
163 		output_buffer += vlen;
164 	}
165 
166 	/* convert leftover elements */
167 	i = i * vlen;
168 	for (; i < nb_elements; i++) {
169 		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
170 		input_buffer++;
171 		output_buffer++;
172 	}
173 
174 	return 0;
175 }
176 
177 static inline void
178 __float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
179 {
180 	uint16x4_t u16x4_l;
181 	uint16x4_t u16x4_h;
182 	float32x4_t f32x4;
183 	uint32x4_t u32x4;
184 	uint16x8_t u16x8;
185 	uint8x8_t u8x8;
186 
187 	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
188 	 * Use round to nearest with ties away rounding mode.
189 	 */
190 	f32x4 = vld1q_f32(input);
191 	f32x4 = vmulq_n_f32(f32x4, scale);
192 	u32x4 = vcvtaq_u32_f32(f32x4);
193 	u16x4_l = vqmovn_u32(u32x4);
194 
195 	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
196 	 * Use round to nearest with ties away rounding mode.
197 	 */
198 	f32x4 = vld1q_f32(input + 4);
199 	f32x4 = vmulq_n_f32(f32x4, scale);
200 	u32x4 = vcvtaq_u32_f32(f32x4);
201 	u16x4_h = vqmovn_u32(u32x4);
202 
203 	/* combine lower and higher uint16x4_t */
204 	u16x8 = vcombine_u16(u16x4_l, u16x4_h);
205 
206 	/* narrow to uint8x8_t */
207 	u8x8 = vqmovn_u16(u16x8);
208 
209 	/* store 8 elements */
210 	vst1_u8(output, u8x8);
211 }
212 
213 static inline void
214 __float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
215 {
216 	uint32_t u32;
217 	uint16_t u16;
218 
219 	/* scale and convert, round to nearest with ties away rounding mode */
220 	u32 = vcvtas_u32_f32(scale * (*input));
221 
222 	/* saturate narrow */
223 	u16 = vqmovns_u32(u32);
224 
225 	/* saturate narrow to uint8_t */
226 	*output = vqmovnh_u16(u16);
227 }
228 
229 int
230 rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
231 {
232 	float *input_buffer;
233 	uint8_t *output_buffer;
234 	uint64_t nb_iterations;
235 	uint32_t vlen;
236 	uint64_t i;
237 
238 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
239 		return -EINVAL;
240 
241 	input_buffer = (float *)input;
242 	output_buffer = (uint8_t *)output;
243 	vlen = 2 * sizeof(float) / sizeof(uint8_t);
244 	nb_iterations = nb_elements / vlen;
245 
246 	/* convert vlen elements in each iteration */
247 	for (i = 0; i < nb_iterations; i++) {
248 		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
249 		input_buffer += vlen;
250 		output_buffer += vlen;
251 	}
252 
253 	/* convert leftover elements */
254 	i = i * vlen;
255 	for (; i < nb_elements; i++) {
256 		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
257 		input_buffer++;
258 		output_buffer++;
259 	}
260 
261 	return 0;
262 }
263 
264 static inline void
265 __uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
266 {
267 	float32x4_t f32x4;
268 	uint16x8_t u16x8;
269 	uint16x4_t u16x4;
270 	uint32x4_t u32x4;
271 	uint8x8_t u8x8;
272 
273 	/* load 8 x uint8_t elements */
274 	u8x8 = vld1_u8(input);
275 
276 	/* widen uint8_t to uint16_t */
277 	u16x8 = vmovl_u8(u8x8);
278 
279 	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
280 	u16x4 = vget_low_u16(u16x8);
281 	u32x4 = vmovl_u16(u16x4);
282 	f32x4 = vcvtq_f32_u32(u32x4);
283 	f32x4 = vmulq_n_f32(f32x4, scale);
284 	vst1q_f32(output, f32x4);
285 
286 	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
287 	u16x4 = vget_high_u16(u16x8);
288 	u32x4 = vmovl_u16(u16x4);
289 	f32x4 = vcvtq_f32_u32(u32x4);
290 	f32x4 = vmulq_n_f32(f32x4, scale);
291 	vst1q_f32(output + 4, f32x4);
292 }
293 
294 static inline void
295 __uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
296 {
297 	*output = scale * vcvts_f32_u32((uint32_t)*input);
298 }
299 
300 int
301 rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
302 {
303 	uint8_t *input_buffer;
304 	float *output_buffer;
305 	uint64_t nb_iterations;
306 	uint64_t vlen;
307 	uint64_t i;
308 
309 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
310 		return -EINVAL;
311 
312 	input_buffer = (uint8_t *)input;
313 	output_buffer = (float *)output;
314 	vlen = 2 * sizeof(float) / sizeof(uint8_t);
315 	nb_iterations = nb_elements / vlen;
316 
317 	/* convert vlen elements in each iteration */
318 	for (i = 0; i < nb_iterations; i++) {
319 		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
320 		input_buffer += vlen;
321 		output_buffer += vlen;
322 	}
323 
324 	/* convert leftover elements */
325 	i = i * vlen;
326 	for (; i < nb_elements; i++) {
327 		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
328 		input_buffer++;
329 		output_buffer++;
330 	}
331 
332 	return 0;
333 }
334 
335 static inline void
336 __float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
337 {
338 	float32x4_t f32x4;
339 	int16x4_t s16x4;
340 	int32x4_t s32x4;
341 
342 	/* load 4 x float elements */
343 	f32x4 = vld1q_f32(input);
344 
345 	/* scale */
346 	f32x4 = vmulq_n_f32(f32x4, scale);
347 
348 	/* convert to int32x4_t using round to nearest with ties away rounding mode */
349 	s32x4 = vcvtaq_s32_f32(f32x4);
350 
351 	/* saturate narrow to int16x4_t */
352 	s16x4 = vqmovn_s32(s32x4);
353 
354 	/* store 4 elements */
355 	vst1_s16(output, s16x4);
356 }
357 
358 static inline void
359 __float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
360 {
361 	int32_t s32;
362 
363 	/* scale and convert, round to nearest with ties away rounding mode */
364 	s32 = vcvtas_s32_f32(scale * (*input));
365 
366 	/* saturate narrow */
367 	*output = vqmovns_s32(s32);
368 }
369 
370 int
371 rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
372 {
373 	float *input_buffer;
374 	int16_t *output_buffer;
375 	uint64_t nb_iterations;
376 	uint32_t vlen;
377 	uint64_t i;
378 
379 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
380 		return -EINVAL;
381 
382 	input_buffer = (float *)input;
383 	output_buffer = (int16_t *)output;
384 	vlen = 2 * sizeof(float) / sizeof(int16_t);
385 	nb_iterations = nb_elements / vlen;
386 
387 	/* convert vlen elements in each iteration */
388 	for (i = 0; i < nb_iterations; i++) {
389 		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
390 		input_buffer += vlen;
391 		output_buffer += vlen;
392 	}
393 
394 	/* convert leftover elements */
395 	i = i * vlen;
396 	for (; i < nb_elements; i++) {
397 		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
398 		input_buffer++;
399 		output_buffer++;
400 	}
401 
402 	return 0;
403 }
404 
405 static inline void
406 __int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
407 {
408 	float32x4_t f32x4;
409 	int16x4_t s16x4;
410 	int32x4_t s32x4;
411 
412 	/* load 4 x int16_t elements */
413 	s16x4 = vld1_s16(input);
414 
415 	/* widen int16_t to int32_t */
416 	s32x4 = vmovl_s16(s16x4);
417 
418 	/* convert int32_t to float */
419 	f32x4 = vcvtq_f32_s32(s32x4);
420 
421 	/* scale */
422 	f32x4 = vmulq_n_f32(f32x4, scale);
423 
424 	/* store float32x4_t */
425 	vst1q_f32(output, f32x4);
426 }
427 
428 static inline void
429 __int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
430 {
431 	*output = scale * vcvts_f32_s32((int32_t)*input);
432 }
433 
434 int
435 rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
436 {
437 	int16_t *input_buffer;
438 	float *output_buffer;
439 	uint64_t nb_iterations;
440 	uint32_t vlen;
441 	uint64_t i;
442 
443 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
444 		return -EINVAL;
445 
446 	input_buffer = (int16_t *)input;
447 	output_buffer = (float *)output;
448 	vlen = 2 * sizeof(float) / sizeof(int16_t);
449 	nb_iterations = nb_elements / vlen;
450 
451 	/* convert vlen elements in each iteration */
452 	for (i = 0; i < nb_iterations; i++) {
453 		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
454 		input_buffer += vlen;
455 		output_buffer += vlen;
456 	}
457 
458 	/* convert leftover elements */
459 	i = i * vlen;
460 	for (; i < nb_elements; i++) {
461 		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
462 		input_buffer++;
463 		output_buffer++;
464 	}
465 
466 	return 0;
467 }
468 
469 static inline void
470 __float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
471 {
472 	float32x4_t f32x4;
473 	uint16x4_t u16x4;
474 	uint32x4_t u32x4;
475 
476 	/* load 4 float elements */
477 	f32x4 = vld1q_f32(input);
478 
479 	/* scale */
480 	f32x4 = vmulq_n_f32(f32x4, scale);
481 
482 	/* convert using round to nearest with ties to away rounding mode */
483 	u32x4 = vcvtaq_u32_f32(f32x4);
484 
485 	/* saturate narrow */
486 	u16x4 = vqmovn_u32(u32x4);
487 
488 	/* store 4 elements */
489 	vst1_u16(output, u16x4);
490 }
491 
492 static inline void
493 __float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
494 {
495 	uint32_t u32;
496 
497 	/* scale and convert, round to nearest with ties away rounding mode */
498 	u32 = vcvtas_u32_f32(scale * (*input));
499 
500 	/* saturate narrow */
501 	*output = vqmovns_u32(u32);
502 }
503 
504 int
505 rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
506 {
507 	float *input_buffer;
508 	uint16_t *output_buffer;
509 	uint64_t nb_iterations;
510 	uint64_t vlen;
511 	uint64_t i;
512 
513 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
514 		return -EINVAL;
515 
516 	input_buffer = (float *)input;
517 	output_buffer = (uint16_t *)output;
518 	vlen = 2 * sizeof(float) / sizeof(uint16_t);
519 	nb_iterations = nb_elements / vlen;
520 
521 	/* convert vlen elements in each iteration */
522 	for (i = 0; i < nb_iterations; i++) {
523 		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
524 		input_buffer += vlen;
525 		output_buffer += vlen;
526 	}
527 
528 	/* convert leftover elements */
529 	i = i * vlen;
530 	for (; i < nb_elements; i++) {
531 		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
532 		input_buffer++;
533 		output_buffer++;
534 	}
535 
536 	return 0;
537 }
538 
539 static inline void
540 __uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
541 {
542 	float32x4_t f32x4;
543 	uint16x4_t u16x4;
544 	uint32x4_t u32x4;
545 
546 	/* load 4 x uint16_t elements */
547 	u16x4 = vld1_u16(input);
548 
549 	/* widen uint16_t to uint32_t */
550 	u32x4 = vmovl_u16(u16x4);
551 
552 	/* convert uint32_t to float */
553 	f32x4 = vcvtq_f32_u32(u32x4);
554 
555 	/* scale */
556 	f32x4 = vmulq_n_f32(f32x4, scale);
557 
558 	/* store float32x4_t */
559 	vst1q_f32(output, f32x4);
560 }
561 
562 static inline void
563 __uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
564 {
565 	*output = scale * vcvts_f32_u32((uint32_t)*input);
566 }
567 
568 int
569 rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
570 {
571 	uint16_t *input_buffer;
572 	float *output_buffer;
573 	uint64_t nb_iterations;
574 	uint32_t vlen;
575 	uint64_t i;
576 
577 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
578 		return -EINVAL;
579 
580 	input_buffer = (uint16_t *)input;
581 	output_buffer = (float *)output;
582 	vlen = 2 * sizeof(float) / sizeof(uint16_t);
583 	nb_iterations = nb_elements / vlen;
584 
585 	/* convert vlen elements in each iteration */
586 	for (i = 0; i < nb_iterations; i++) {
587 		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
588 		input_buffer += vlen;
589 		output_buffer += vlen;
590 	}
591 
592 	/* convert leftover elements */
593 	i = i * vlen;
594 	for (; i < nb_elements; i++) {
595 		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
596 		input_buffer++;
597 		output_buffer++;
598 	}
599 
600 	return 0;
601 }
602 
603 static inline void
604 __float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
605 {
606 	float32x4_t f32x4;
607 	int32x4_t s32x4;
608 
609 	/* load 4 x float elements */
610 	f32x4 = vld1q_f32(input);
611 
612 	/* scale */
613 	f32x4 = vmulq_n_f32(f32x4, scale);
614 
615 	/* convert to int32x4_t using round to nearest with ties away rounding mode */
616 	s32x4 = vcvtaq_s32_f32(f32x4);
617 
618 	/* store 4 elements */
619 	vst1q_s32(output, s32x4);
620 }
621 
622 static inline void
623 __float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
624 {
625 	/* scale and convert, round to nearest with ties away rounding mode */
626 	*output = vcvtas_s32_f32(scale * (*input));
627 }
628 
629 int
630 rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
631 {
632 	float *input_buffer;
633 	int32_t *output_buffer;
634 	uint64_t nb_iterations;
635 	uint32_t vlen;
636 	uint64_t i;
637 
638 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
639 		return -EINVAL;
640 
641 	input_buffer = (float *)input;
642 	output_buffer = (int32_t *)output;
643 	vlen = 2 * sizeof(float) / sizeof(int32_t);
644 	nb_iterations = nb_elements / vlen;
645 
646 	/* convert vlen elements in each iteration */
647 	for (i = 0; i < nb_iterations; i++) {
648 		__float32_to_int32_neon_s32x4(scale, input_buffer, output_buffer);
649 		input_buffer += vlen;
650 		output_buffer += vlen;
651 	}
652 
653 	/* convert leftover elements */
654 	i = i * vlen;
655 	for (; i < nb_elements; i++) {
656 		__float32_to_int32_neon_s32x1(scale, input_buffer, output_buffer);
657 		input_buffer++;
658 		output_buffer++;
659 	}
660 
661 	return 0;
662 }
663 
664 static inline void
665 __int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
666 {
667 	float32x4_t f32x4;
668 	int32x4_t s32x4;
669 
670 	/* load 4 x int32_t elements */
671 	s32x4 = vld1q_s32(input);
672 
673 	/* convert int32_t to float */
674 	f32x4 = vcvtq_f32_s32(s32x4);
675 
676 	/* scale */
677 	f32x4 = vmulq_n_f32(f32x4, scale);
678 
679 	/* store float32x4_t */
680 	vst1q_f32(output, f32x4);
681 }
682 
683 static inline void
684 __int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
685 {
686 	*output = scale * vcvts_f32_s32(*input);
687 }
688 
689 int
690 rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
691 {
692 	int32_t *input_buffer;
693 	float *output_buffer;
694 	uint64_t nb_iterations;
695 	uint32_t vlen;
696 	uint64_t i;
697 
698 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
699 		return -EINVAL;
700 
701 	input_buffer = (int32_t *)input;
702 	output_buffer = (float *)output;
703 	vlen = 2 * sizeof(float) / sizeof(int32_t);
704 	nb_iterations = nb_elements / vlen;
705 
706 	/* convert vlen elements in each iteration */
707 	for (i = 0; i < nb_iterations; i++) {
708 		__int32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
709 		input_buffer += vlen;
710 		output_buffer += vlen;
711 	}
712 
713 	/* convert leftover elements */
714 	i = i * vlen;
715 	for (; i < nb_elements; i++) {
716 		__int32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
717 		input_buffer++;
718 		output_buffer++;
719 	}
720 
721 	return 0;
722 }
723 
724 static inline void
725 __float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
726 {
727 	float32x4_t f32x4;
728 	uint32x4_t u32x4;
729 
730 	/* load 4 float elements */
731 	f32x4 = vld1q_f32(input);
732 
733 	/* scale */
734 	f32x4 = vmulq_n_f32(f32x4, scale);
735 
736 	/* convert using round to nearest with ties to away rounding mode */
737 	u32x4 = vcvtaq_u32_f32(f32x4);
738 
739 	/* store 4 elements */
740 	vst1q_u32(output, u32x4);
741 }
742 
743 static inline void
744 __float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
745 {
746 	/* scale and convert, round to nearest with ties away rounding mode */
747 	*output = vcvtas_u32_f32(scale * (*input));
748 }
749 
750 int
751 rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
752 {
753 	float *input_buffer;
754 	uint32_t *output_buffer;
755 	uint64_t nb_iterations;
756 	uint64_t vlen;
757 	uint64_t i;
758 
759 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
760 		return -EINVAL;
761 
762 	input_buffer = (float *)input;
763 	output_buffer = (uint32_t *)output;
764 	vlen = 2 * sizeof(float) / sizeof(uint32_t);
765 	nb_iterations = nb_elements / vlen;
766 
767 	/* convert vlen elements in each iteration */
768 	for (i = 0; i < nb_iterations; i++) {
769 		__float32_to_uint32_neon_u32x4(scale, input_buffer, output_buffer);
770 		input_buffer += vlen;
771 		output_buffer += vlen;
772 	}
773 
774 	/* convert leftover elements */
775 	i = i * vlen;
776 	for (; i < nb_elements; i++) {
777 		__float32_to_uint32_neon_u32x1(scale, input_buffer, output_buffer);
778 		input_buffer++;
779 		output_buffer++;
780 	}
781 
782 	return 0;
783 }
784 
785 static inline void
786 __uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
787 {
788 	float32x4_t f32x4;
789 	uint32x4_t u32x4;
790 
791 	/* load 4 x uint32_t elements */
792 	u32x4 = vld1q_u32(input);
793 
794 	/* convert uint32_t to float */
795 	f32x4 = vcvtq_f32_u32(u32x4);
796 
797 	/* scale */
798 	f32x4 = vmulq_n_f32(f32x4, scale);
799 
800 	/* store float32x4_t */
801 	vst1q_f32(output, f32x4);
802 }
803 
804 static inline void
805 __uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
806 {
807 	*output = scale * vcvts_f32_u32(*input);
808 }
809 
810 int
811 rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
812 {
813 	uint32_t *input_buffer;
814 	float *output_buffer;
815 	uint64_t nb_iterations;
816 	uint32_t vlen;
817 	uint64_t i;
818 
819 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
820 		return -EINVAL;
821 
822 	input_buffer = (uint32_t *)input;
823 	output_buffer = (float *)output;
824 	vlen = 2 * sizeof(float) / sizeof(uint32_t);
825 	nb_iterations = nb_elements / vlen;
826 
827 	/* convert vlen elements in each iteration */
828 	for (i = 0; i < nb_iterations; i++) {
829 		__uint32_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
830 		input_buffer += vlen;
831 		output_buffer += vlen;
832 	}
833 
834 	/* convert leftover elements */
835 	i = i * vlen;
836 	for (; i < nb_elements; i++) {
837 		__uint32_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
838 		input_buffer++;
839 		output_buffer++;
840 	}
841 
842 	return 0;
843 }
844 
845 static inline void
846 __float32_to_int64_neon_s64x2(float scale, float *input, int64_t *output)
847 {
848 	float32x2_t f32x2;
849 	float64x2_t f64x2;
850 	int64x2_t s64x2;
851 
852 	/* load 2 x float elements */
853 	f32x2 = vld1_f32(input);
854 
855 	/* scale */
856 	f32x2 = vmul_n_f32(f32x2, scale);
857 
858 	/* convert to float64x2_t */
859 	f64x2 = vcvt_f64_f32(f32x2);
860 
861 	/* convert to int64x2_t */
862 	s64x2 = vcvtaq_s64_f64(f64x2);
863 
864 	/* store 2 elements */
865 	vst1q_s64(output, s64x2);
866 }
867 
868 static inline void
869 __float32_to_int64_neon_s64x1(float scale, float *input, int64_t *output)
870 {
871 	float32x2_t f32x2;
872 	float64x2_t f64x2;
873 	int64x2_t s64x2;
874 
875 	/* duplicate 1 x float element to both lanes */
876 	f32x2 = vdup_n_f32(*input);
877 
878 	/* scale */
879 	f32x2 = vmul_n_f32(f32x2, scale);
880 
881 	/* convert to float64x2_t */
882 	f64x2 = vcvt_f64_f32(f32x2);
883 
884 	/* convert to int64x2_t */
885 	s64x2 = vcvtaq_s64_f64(f64x2);
886 
887 	/* store lane 0 of int64x2_t */
888 	vst1q_lane_s64(output, s64x2, 0);
889 }
890 
891 int
892 rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void *output)
893 {
894 	float *input_buffer;
895 	int64_t *output_buffer;
896 	uint64_t nb_iterations;
897 	uint32_t vlen;
898 	uint64_t i;
899 
900 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
901 		return -EINVAL;
902 
903 	input_buffer = (float *)input;
904 	output_buffer = (int64_t *)output;
905 	vlen = 4 * sizeof(float) / sizeof(int64_t);
906 	nb_iterations = nb_elements / vlen;
907 
908 	/* convert vlen elements in each iteration */
909 	for (i = 0; i < nb_iterations; i++) {
910 		__float32_to_int64_neon_s64x2(scale, input_buffer, output_buffer);
911 		input_buffer += vlen;
912 		output_buffer += vlen;
913 	}
914 
915 	/* convert leftover elements */
916 	i = i * vlen;
917 	for (; i < nb_elements; i++) {
918 		__float32_to_int64_neon_s64x1(scale, input_buffer, output_buffer);
919 		input_buffer++;
920 		output_buffer++;
921 	}
922 
923 	return 0;
924 }
925 
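/* Note on the 64-bit paths: the 64-bit integer conversion routines in this
 * file go through a float64x2_t intermediate, presumably because the ties-away
 * conversions to 64-bit integers (vcvtaq_s64_f64, vcvtaq_u64_f64) are provided
 * only for float64 vectors. With two 64-bit lanes per 128-bit vector, vlen
 * evaluates to 4 * sizeof(float) / sizeof(int64_t) = 2; e.g. for
 * nb_elements = 5 the vector loop runs 5 / 2 = 2 iterations and one element is
 * left for the scalar tail.
 */
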
926 static inline void
927 __int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
928 {
929 	int64x2_t s64x2;
930 	float64x2_t f64x2;
931 	float32x2_t f32x2;
932 
933 	/* load 2 x int64_t elements */
934 	s64x2 = vld1q_s64(input);
935 
936 	/* convert int64x2_t to float64x2_t */
937 	f64x2 = vcvtq_f64_s64(s64x2);
938 
939 	/* convert float64x2_t to float32x2_t */
940 	f32x2 = vcvt_f32_f64(f64x2);
941 
942 	/* scale */
943 	f32x2 = vmul_n_f32(f32x2, scale);
944 
945 	/* store float32x2_t */
946 	vst1_f32(output, f32x2);
947 }
948 
949 static inline void
950 __int64_to_float32_neon_f32x1(float scale, int64_t *input, float *output)
951 {
952 	int64x2_t s64x2;
953 	float64x2_t f64x2;
954 	float32x2_t f32x2;
955 
956 	/* load 1 x int64_t element into lane 0 */
957 	s64x2 = vld1q_lane_s64(input, vdupq_n_s64(0), 0);
958 
959 	/* convert int64x2_t to float64x2_t */
960 	f64x2 = vcvtq_f64_s64(s64x2);
961 
962 	/* convert float64x2_t to float32x2_t */
963 	f32x2 = vcvt_f32_f64(f64x2);
964 
965 	/* scale */
966 	f32x2 = vmul_n_f32(f32x2, scale);
967 
968 	/* store float32x2_t */
969 	vst1_lane_f32(output, f32x2, 0);
970 }
971 
972 int
973 rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
974 {
975 	int64_t *input_buffer;
976 	float *output_buffer;
977 	uint64_t nb_iterations;
978 	uint32_t vlen;
979 	uint64_t i;
980 
981 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
982 		return -EINVAL;
983 
984 	input_buffer = (int64_t *)input;
985 	output_buffer = (float *)output;
986 	vlen = 4 * sizeof(float) / sizeof(int64_t);
987 	nb_iterations = nb_elements / vlen;
988 
989 	/* convert vlen elements in each iteration */
990 	for (i = 0; i < nb_iterations; i++) {
991 		__int64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
992 		input_buffer += vlen;
993 		output_buffer += vlen;
994 	}
995 
996 	/* convert leftover elements */
997 	i = i * vlen;
998 	for (; i < nb_elements; i++) {
999 		__int64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
1000 		input_buffer++;
1001 		output_buffer++;
1002 	}
1003 
1004 	return 0;
1005 }
1006 
1007 static inline void
1008 __float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
1009 {
1010 	float32x2_t f32x2;
1011 	float64x2_t f64x2;
1012 	uint64x2_t u64x2;
1013 
1014 	/* load 2 x float elements */
1015 	f32x2 = vld1_f32(input);
1016 
1017 	/* scale */
1018 	f32x2 = vmul_n_f32(f32x2, scale);
1019 
1020 	/* convert to float64x2_t */
1021 	f64x2 = vcvt_f64_f32(f32x2);
1022 
1023 	/* convert to uint64x2_t */
1024 	u64x2 = vcvtaq_u64_f64(f64x2);
1025 
1026 	/* store 2 elements */
1027 	vst1q_u64(output, u64x2);
1028 }
1029 
1030 static inline void
1031 __float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
1032 {
1033 	float32x2_t f32x2;
1034 	float64x2_t f64x2;
1035 	uint64x2_t u64x2;
1036 
1037 	/* load 1 x float element */
1038 	f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);
1039 
1040 	/* scale */
1041 	f32x2 = vmul_n_f32(f32x2, scale);
1042 
1043 	/* convert to float64x2_t */
1044 	f64x2 = vcvt_f64_f32(f32x2);
1045 
1046 	/* convert to uint64x2_t */
1047 	u64x2 = vcvtaq_u64_f64(f64x2);
1048 
1049 	/* store lane 0 of uint64x2_t */
1050 	vst1q_lane_u64(output, u64x2, 0);
1051 }
1052 
1053 int
1054 rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void *output)
1055 {
1056 	float *input_buffer;
1057 	uint64_t *output_buffer;
1058 	uint64_t nb_iterations;
1059 	uint32_t vlen;
1060 	uint64_t i;
1061 
1062 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
1063 		return -EINVAL;
1064 
1065 	input_buffer = (float *)input;
1066 	output_buffer = (uint64_t *)output;
1067 	vlen = 4 * sizeof(float) / sizeof(uint64_t);
1068 	nb_iterations = nb_elements / vlen;
1069 
1070 	/* convert vlen elements in each iteration */
1071 	for (i = 0; i < nb_iterations; i++) {
1072 		__float32_to_uint64_neon_u64x2(scale, input_buffer, output_buffer);
1073 		input_buffer += vlen;
1074 		output_buffer += vlen;
1075 	}
1076 
1077 	/* convert leftover elements */
1078 	i = i * vlen;
1079 	for (; i < nb_elements; i++) {
1080 		__float32_to_uint64_neon_u64x1(scale, input_buffer, output_buffer);
1081 		input_buffer++;
1082 		output_buffer++;
1083 	}
1084 
1085 	return 0;
1086 }
1087 
1088 static inline void
1089 __uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
1090 {
1091 	uint64x2_t u64x2;
1092 	float64x2_t f64x2;
1093 	float32x2_t f32x2;
1094 
1095 	/* load 2 x uint64_t elements */
1096 	u64x2 = vld1q_u64(input);
1097 
1098 	/* convert uint64x2_t to float64x2_t */
1099 	f64x2 = vcvtq_f64_u64(u64x2);
1100 
1101 	/* convert float64x2_t to float32x2_t */
1102 	f32x2 = vcvt_f32_f64(f64x2);
1103 
1104 	/* scale */
1105 	f32x2 = vmul_n_f32(f32x2, scale);
1106 
1107 	/* store float32x2_t */
1108 	vst1_f32(output, f32x2);
1109 }
1110 
1111 static inline void
1112 __uint64_to_float32_neon_f32x1(float scale, uint64_t *input, float *output)
1113 {
1114 	uint64x2_t u64x2;
1115 	float64x2_t f64x2;
1116 	float32x2_t f32x2;
1117 
1118 	/* load 1 x uint64_t element into lane 0 */
1119 	u64x2 = vld1q_lane_u64(input, vdupq_n_u64(0), 0);
1120 
1121 	/* convert uint64x2_t to float64x2_t */
1122 	f64x2 = vcvtq_f64_u64(u64x2);
1123 
1124 	/* convert float64x2_t to float32x2_t */
1125 	f32x2 = vcvt_f32_f64(f64x2);
1126 
1127 	/* scale */
1128 	f32x2 = vmul_n_f32(f32x2, scale);
1129 
1130 	/* store float32x2_t */
1131 	vst1_lane_f32(output, f32x2, 0);
1132 }
1133 
1134 int
1135 rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
1136 {
1137 	uint64_t *input_buffer;
1138 	float *output_buffer;
1139 	uint64_t nb_iterations;
1140 	uint32_t vlen;
1141 	uint64_t i;
1142 
1143 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
1144 		return -EINVAL;
1145 
1146 	input_buffer = (uint64_t *)input;
1147 	output_buffer = (float *)output;
1148 	vlen = 4 * sizeof(float) / sizeof(uint64_t);
1149 	nb_iterations = nb_elements / vlen;
1150 
1151 	/* convert vlen elements in each iteration */
1152 	for (i = 0; i < nb_iterations; i++) {
1153 		__uint64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
1154 		input_buffer += vlen;
1155 		output_buffer += vlen;
1156 	}
1157 
1158 	/* convert leftover elements */
1159 	i = i * vlen;
1160 	for (; i < nb_elements; i++) {
1161 		__uint64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
1162 		input_buffer++;
1163 		output_buffer++;
1164 	}
1165 
1166 	return 0;
1167 }
1168 
1169 static inline void
1170 __float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
1171 {
1172 	float32x4_t f32x4;
1173 	float16x4_t f16x4;
1174 
1175 	/* load 4 x float32_t elements */
1176 	f32x4 = vld1q_f32(input);
1177 
1178 	/* convert to float16x4_t */
1179 	f16x4 = vcvt_f16_f32(f32x4);
1180 
1181 	/* store float16x4_t */
1182 	vst1_f16(output, f16x4);
1183 }
1184 
1185 static inline void
1186 __float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
1187 {
1188 	float32x4_t f32x4;
1189 	float16x4_t f16x4;
1190 
1191 	/* load element to 4 lanes */
1192 	f32x4 = vld1q_dup_f32(input);
1193 
1194 	/* convert float32_t to float16_t */
1195 	f16x4 = vcvt_f16_f32(f32x4);
1196 
1197 	/* store lane 0 (1 element) */
1198 	vst1_lane_f16(output, f16x4, 0);
1199 }
1200 
1201 int
1202 rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
1203 {
1204 	float32_t *input_buffer;
1205 	float16_t *output_buffer;
1206 	uint64_t nb_iterations;
1207 	uint32_t vlen;
1208 	uint64_t i;
1209 
1210 	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
1211 		return -EINVAL;
1212 
1213 	input_buffer = (float32_t *)input;
1214 	output_buffer = (float16_t *)output;
1215 	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
1216 	nb_iterations = nb_elements / vlen;
1217 
1218 	/* convert vlen elements in each iteration */
1219 	for (i = 0; i < nb_iterations; i++) {
1220 		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
1221 		input_buffer += vlen;
1222 		output_buffer += vlen;
1223 	}
1224 
1225 	/* convert leftover elements */
1226 	i = i * vlen;
1227 	for (; i < nb_elements; i++) {
1228 		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
1229 		input_buffer++;
1230 		output_buffer++;
1231 	}
1232 
1233 	return 0;
1234 }
1235 
1236 static inline void
1237 __float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
1238 {
1239 	float16x4_t f16x4;
1240 	float32x4_t f32x4;
1241 
1242 	/* load 4 x float16_t elements */
1243 	f16x4 = vld1_f16(input);
1244 
1245 	/* convert float16x4_t to float32x4_t */
1246 	f32x4 = vcvt_f32_f16(f16x4);
1247 
1248 	/* store float32x4_t */
1249 	vst1q_f32(output, f32x4);
1250 }
1251 
1252 static inline void
1253 __float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
1254 {
1255 	float16x4_t f16x4;
1256 	float32x4_t f32x4;
1257 
1258 	/* load element to 4 lanes */
1259 	f16x4 = vld1_dup_f16(input);
1260 
1261 	/* convert float16_t to float32_t */
1262 	f32x4 = vcvt_f32_f16(f16x4);
1263 
1264 	/* store 1 element */
1265 	vst1q_lane_f32(output, f32x4, 0);
1266 }
1267 
1268 int
1269 rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
1270 {
1271 	float16_t *input_buffer;
1272 	float32_t *output_buffer;
1273 	uint64_t nb_iterations;
1274 	uint32_t vlen;
1275 	uint64_t i;
1276 
1277 	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
1278 		return -EINVAL;
1279 
1280 	input_buffer = (float16_t *)input;
1281 	output_buffer = (float32_t *)output;
1282 	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
1283 	nb_iterations = nb_elements / vlen;
1284 
1285 	/* convert vlen elements in each iteration */
1286 	for (i = 0; i < nb_iterations; i++) {
1287 		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
1288 		input_buffer += vlen;
1289 		output_buffer += vlen;
1290 	}
1291 
1292 	/* convert leftover elements */
1293 	i = i * vlen;
1294 	for (; i < nb_elements; i++) {
1295 		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
1296 		input_buffer++;
1297 		output_buffer++;
1298 	}
1299 
1300 	return 0;
1301 }
1302