/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vectorized versions of the Machine Learning utility functions used to
 * convert data between higher and lower precision types, except for bfloat16. The implementation
 * is based on Arm Neon intrinsics.
 */

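/* Example usage (illustrative sketch; buffer sizes and the scale value are arbitrary):
 *
 *	float f32[16] = {0};
 *	int8_t i8[16];
 *	uint16_t f16[16];	// raw storage for IEEE float16 values
 *	int ret;
 *
 *	// quantize 16 float32 elements to int8 with a scale of 64.0f;
 *	// returns 0 on success or -EINVAL on invalid arguments
 *	ret = rte_ml_io_float32_to_int8(64.0f, 16, f32, i8);
 *
 *	// convert 16 float32 elements to float16 (no scale argument)
 *	ret = rte_ml_io_float32_to_float16(16, f32, f16);
 */
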
static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* saturate narrow to int8_t */
	*output = vqmovnh_s16(s16);
}

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
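	/* vlen is the number of int8_t lanes in a 64-bit Neon vector (8),
	 * i.e. the element count handled by __float32_to_int8_neon_s8x8()
	 */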
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* saturate narrow to uint8_t */
	*output = vqmovnh_u16(u16);
}

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
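	/* vlen is the number of int16_t lanes in a 64-bit Neon vector (4),
	 * i.e. the element count handled by __float32_to_int16_neon_s16x4()
	 */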
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 (1 element) */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
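	/* vlen is the number of float16_t lanes in a 64-bit Neon vector (4),
	 * i.e. the element count handled by __float32_to_float16_neon_f16x4()
	 */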
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}