xref: /dpdk/lib/mldev/mldev_utils_scalar.c (revision 65282e9f8e118a4ca977d1aee2d7f51f44e9bc1b)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2022 Marvell.
3  */
4 
5 #include "mldev_utils_scalar.h"
6 
7 /* Description:
8  * This file implements scalar versions of Machine Learning utility functions used to convert data
9  * types from higher precision to lower precision and vice-versa, except bfloat16.
10  */
11 
12 int
13 rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
14 			  int8_t zero_point)
15 {
16 	const float *input_buffer;
17 	int8_t *output_buffer;
18 	uint64_t i;
19 	int i32;
20 
21 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
22 		return -EINVAL;
23 
24 	input_buffer = (const float *)input;
25 	output_buffer = (int8_t *)output;
26 
27 	for (i = 0; i < nb_elements; i++) {
28 		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
29 
30 		if (i32 < INT8_MIN)
31 			i32 = INT8_MIN;
32 
33 		if (i32 > INT8_MAX)
34 			i32 = INT8_MAX;
35 
36 		*output_buffer = (int8_t)i32;
37 
38 		input_buffer++;
39 		output_buffer++;
40 	}
41 
42 	return 0;
43 }
44 
/* Dequantize int8 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			  int8_t zero_point)
{
	const int8_t *src = (const int8_t *)input;
	float *dst = (float *)output;
	uint64_t idx;

	if (scale == 0 || nb_elements == 0 || input == NULL || output == NULL)
		return -EINVAL;

	for (idx = 0; idx < nb_elements; idx++)
		dst[idx] = scale * (float)(src[idx] - zero_point);

	return 0;
}
68 
69 int
70 rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
71 			   uint8_t zero_point)
72 {
73 	const float *input_buffer;
74 	uint8_t *output_buffer;
75 	int32_t i32;
76 	uint64_t i;
77 
78 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
79 		return -EINVAL;
80 
81 	input_buffer = (const float *)input;
82 	output_buffer = (uint8_t *)output;
83 
84 	for (i = 0; i < nb_elements; i++) {
85 		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
86 
87 		if (i32 < 0)
88 			i32 = 0;
89 
90 		if (i32 > UINT8_MAX)
91 			i32 = UINT8_MAX;
92 
93 		*output_buffer = (uint8_t)i32;
94 
95 		input_buffer++;
96 		output_buffer++;
97 	}
98 
99 	return 0;
100 }
101 
/* Dequantize uint8 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   uint8_t zero_point)
{
	const uint8_t *src = (const uint8_t *)input;
	float *dst = (float *)output;
	uint64_t idx;

	if (scale == 0 || nb_elements == 0 || input == NULL || output == NULL)
		return -EINVAL;

	for (idx = 0; idx < nb_elements; idx++)
		dst[idx] = scale * (float)(src[idx] - zero_point);

	return 0;
}
125 
126 int
127 rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
128 			   int16_t zero_point)
129 {
130 	const float *input_buffer;
131 	int16_t *output_buffer;
132 	int32_t i32;
133 	uint64_t i;
134 
135 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
136 		return -EINVAL;
137 
138 	input_buffer = (const float *)input;
139 	output_buffer = (int16_t *)output;
140 
141 	for (i = 0; i < nb_elements; i++) {
142 		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
143 
144 		if (i32 < INT16_MIN)
145 			i32 = INT16_MIN;
146 
147 		if (i32 > INT16_MAX)
148 			i32 = INT16_MAX;
149 
150 		*output_buffer = (int16_t)i32;
151 
152 		input_buffer++;
153 		output_buffer++;
154 	}
155 
156 	return 0;
157 }
158 
/* Dequantize int16 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int16_t zero_point)
{
	const int16_t *src = (const int16_t *)input;
	float *dst = (float *)output;
	uint64_t idx;

	if (scale == 0 || nb_elements == 0 || input == NULL || output == NULL)
		return -EINVAL;

	for (idx = 0; idx < nb_elements; idx++)
		dst[idx] = scale * (float)(src[idx] - zero_point);

	return 0;
}
182 
183 int
184 rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
185 			    uint16_t zero_point)
186 {
187 	const float *input_buffer;
188 	uint16_t *output_buffer;
189 	int32_t i32;
190 	uint64_t i;
191 
192 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
193 		return -EINVAL;
194 
195 	input_buffer = (const float *)input;
196 	output_buffer = (uint16_t *)output;
197 
198 	for (i = 0; i < nb_elements; i++) {
199 		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
200 
201 		if (i32 < 0)
202 			i32 = 0;
203 
204 		if (i32 > UINT16_MAX)
205 			i32 = UINT16_MAX;
206 
207 		*output_buffer = (uint16_t)i32;
208 
209 		input_buffer++;
210 		output_buffer++;
211 	}
212 
213 	return 0;
214 }
215 
/* Dequantize uint16 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint16_t zero_point)
{
	const uint16_t *src = (const uint16_t *)input;
	float *dst = (float *)output;
	uint64_t idx;

	if (scale == 0 || nb_elements == 0 || input == NULL || output == NULL)
		return -EINVAL;

	for (idx = 0; idx < nb_elements; idx++)
		dst[idx] = scale * (float)(src[idx] - zero_point);

	return 0;
}
239 
240 int
241 rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
242 			   int32_t zero_point)
243 {
244 	const float *input_buffer;
245 	int32_t *output_buffer;
246 	uint64_t i;
247 
248 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
249 		return -EINVAL;
250 
251 	input_buffer = (const float *)input;
252 	output_buffer = (int32_t *)output;
253 
254 	for (i = 0; i < nb_elements; i++) {
255 		*output_buffer = (int32_t)(round(*input_buffer / scale) + zero_point);
256 
257 		input_buffer++;
258 		output_buffer++;
259 	}
260 
261 	return 0;
262 }
263 
/* Dequantize int32 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int32_t zero_point)
{
	const int32_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int32_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		/* Subtract in the double domain: int32 subtraction can overflow
		 * (undefined behavior), e.g. INT32_MIN minus a positive zero_point.
		 * double represents any difference of two int32 values exactly.
		 */
		*output_buffer = scale * (float)((double)*input_buffer - (double)zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}
287 
288 int
289 rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
290 			    uint32_t zero_point)
291 {
292 	const float *input_buffer;
293 	uint32_t *output_buffer;
294 	int32_t i32;
295 	uint64_t i;
296 
297 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
298 		return -EINVAL;
299 
300 	input_buffer = (const float *)input;
301 	output_buffer = (uint32_t *)output;
302 
303 	for (i = 0; i < nb_elements; i++) {
304 		i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
305 
306 		if (i32 < 0)
307 			i32 = 0;
308 
309 		*output_buffer = (uint32_t)i32;
310 
311 		input_buffer++;
312 		output_buffer++;
313 	}
314 
315 	return 0;
316 }
317 
/* Dequantize uint32 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint32_t zero_point)
{
	const uint32_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint32_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		/* Subtract in the double domain: uint32 subtraction wraps modulo
		 * 2^32, so input < zero_point produced a huge positive value instead
		 * of the intended negative one. double holds any uint32 exactly.
		 */
		*output_buffer = scale * (float)((double)*input_buffer - (double)zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}
341 
342 int
343 rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
344 			   int64_t zero_point)
345 {
346 	const float *input_buffer;
347 	int64_t *output_buffer;
348 	uint64_t i;
349 
350 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
351 		return -EINVAL;
352 
353 	input_buffer = (const float *)input;
354 	output_buffer = (int64_t *)output;
355 
356 	for (i = 0; i < nb_elements; i++) {
357 		*output_buffer = (int64_t)(round(*input_buffer / scale) + zero_point);
358 
359 		input_buffer++;
360 		output_buffer++;
361 	}
362 
363 	return 0;
364 }
365 
/* Dequantize int64 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			   int64_t zero_point)
{
	const int64_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const int64_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		/* Subtract in the double domain: int64 subtraction can overflow
		 * (undefined behavior) for extreme input/zero_point combinations.
		 * double is inexact above 2^53, but the final float result has only
		 * a 24-bit significand, so no meaningful precision is lost.
		 */
		*output_buffer = scale * (float)((double)*input_buffer - (double)zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}
389 
390 int
391 rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
392 			    uint64_t zero_point)
393 {
394 	const float *input_buffer;
395 	uint64_t *output_buffer;
396 	int64_t i64;
397 	uint64_t i;
398 
399 	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
400 		return -EINVAL;
401 
402 	input_buffer = (const float *)input;
403 	output_buffer = (uint64_t *)output;
404 
405 	for (i = 0; i < nb_elements; i++) {
406 		i64 = (int64_t)(round(*input_buffer / scale) + zero_point);
407 
408 		if (i64 < 0)
409 			i64 = 0;
410 
411 		*output_buffer = (uint64_t)i64;
412 
413 		input_buffer++;
414 		output_buffer++;
415 	}
416 
417 	return 0;
418 }
419 
/* Dequantize uint64 values to float32: out = scale * (in - zero_point). */
int
rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
			    uint64_t zero_point)
{
	const uint64_t *input_buffer;
	float *output_buffer;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (const uint64_t *)input;
	output_buffer = (float *)output;

	for (i = 0; i < nb_elements; i++) {
		/* Subtract in the double domain: uint64 subtraction wraps modulo
		 * 2^64, so input < zero_point produced a huge positive value instead
		 * of the intended negative one. double is inexact above 2^53, but the
		 * float result keeps only 24 significand bits anyway.
		 */
		*output_buffer = scale * (float)((double)*input_buffer - (double)zero_point);

		input_buffer++;
		output_buffer++;
	}

	return 0;
}
443 
/* Convert a single precision floating point number (float32) into a half precision
 * floating point number (float16) using round to nearest rounding mode; ties are
 * rounded to even (the tie branches below only round up when the result LSB is set).
 *
 * Cases handled:
 *  - float32 zero / subnormal  -> signed zero (subnormals are flushed)
 *  - float32 infinity          -> float16 infinity (sign preserved)
 *  - float32 NaN               -> float16 NaN (top mantissa bits kept, MSB forced to 1)
 *  - float32 normal            -> float16 normal, subnormal (gradual underflow),
 *                                 zero, or infinity (overflow saturation)
 */
static uint16_t
__float32_to_float16_scalar_rtn(float x)
{
	union float32 f32; /* float32 input */
	uint32_t f32_s;	   /* float32 sign */
	uint32_t f32_e;	   /* float32 exponent */
	uint32_t f32_m;	   /* float32 mantissa */
	uint16_t f16_s;	   /* float16 sign */
	uint16_t f16_e;	   /* float16 exponent */
	uint16_t f16_m;	   /* float16 mantissa */
	uint32_t tbits;	   /* number of truncated bits */
	uint32_t tmsb;	   /* MSB position of truncated bits */
	uint32_t m_32;	   /* temporary float32 mantissa */
	uint16_t m_16;	   /* temporary float16 mantissa */
	uint16_t u16;	   /* float16 output */
	int be_16;	   /* float16 biased exponent, signed */

	/* unpack the float32 bit pattern through the union (avoids pointer
	 * type-punning, which would violate strict aliasing)
	 */
	f32.f = x;
	f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;
	f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;
	f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;

	f16_s = f32_s;
	f16_e = 0;
	f16_m = 0;

	switch (f32_e) {
	case (0): /* float32: zero or subnormal number */
		f16_e = 0;
		f16_m = 0; /* convert to zero */
		break;
	case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
		f16_e = FP16_MASK_E >> FP16_LSB_E;
		if (f32_m == 0) { /* infinity */
			f16_m = 0;
		} else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
			f16_m = f32_m >> (FP32_MSB_M - FP16_MSB_M);
			f16_m |= BIT(FP16_MSB_M);
		}
		break;
	default: /* float32: normal number */
		/* compute biased exponent for float16 */
		be_16 = (int)f32_e - FP32_BIAS_E + FP16_BIAS_E;

		/* overflow, be_16 = [31-INF], set to infinity */
		if (be_16 >= (int)(FP16_MASK_E >> FP16_LSB_E)) {
			f16_e = FP16_MASK_E >> FP16_LSB_E;
			f16_m = 0;
		} else if ((be_16 >= 1) && (be_16 < (int)(FP16_MASK_E >> FP16_LSB_E))) {
			/* normal float16, be_16 = [1:30]*/
			f16_e = be_16;
			/* keep the top 10 mantissa bits; the 13 dropped low bits
			 * drive the rounding decision below
			 */
			m_16 = f32_m >> (FP32_LSB_E - FP16_LSB_E);
			tmsb = FP32_MSB_M - FP16_MSB_M - 1;
			if ((f32_m & GENMASK_U32(tmsb, 0)) > BIT(tmsb)) {
				/* round: non-zero truncated bits except MSB */
				m_16++;

				/* overflow into exponent */
				if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
					f16_e++;
			} else if ((f32_m & GENMASK_U32(tmsb, 0)) == BIT(tmsb)) {
				/* round: MSB of truncated bits and LSB of m_16 is set */
				if ((m_16 & 0x1) == 0x1) {
					m_16++;

					/* overflow into exponent */
					if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
						f16_e++;
				}
			}
			f16_m = m_16 & FP16_MASK_M;
		} else if ((be_16 >= -(int)(FP16_MSB_M)) && (be_16 < 1)) {
			/* underflow: zero / subnormal, be_16 = [-9:0] */
			f16_e = 0;

			/* add implicit leading zero */
			m_32 = f32_m | BIT(FP32_LSB_E);
			tbits = FP32_LSB_E - FP16_LSB_E - be_16 + 1;
			m_16 = m_32 >> tbits;

			/* if non-leading truncated bits are set */
			if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {
				m_16++;

				/* overflow into exponent */
				if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
					f16_e++;
			} else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {
				/* if leading truncated bit is set */
				if ((m_16 & 0x1) == 0x1) {
					m_16++;

					/* overflow into exponent */
					if (((m_16 & FP16_MASK_E) >> FP16_LSB_E) == 0x1)
						f16_e++;
				}
			}
			f16_m = m_16 & FP16_MASK_M;
		} else if (be_16 == -(int)(FP16_MSB_M + 1)) {
			/* underflow: zero, be_16 = [-10] */
			f16_e = 0;
			/* round up to the smallest subnormal when any mantissa bit
			 * survives; otherwise round to zero
			 */
			if (f32_m != 0)
				f16_m = 1;
			else
				f16_m = 0;
		} else {
			/* underflow: zero, be_16 = [-INF:-11] */
			f16_e = 0;
			f16_m = 0;
		}

		break;
	}

	u16 = FP16_PACK(f16_s, f16_e, f16_m);

	return u16;
}
565 
/* Convert a buffer of float32 values into float16 (IEEE binary16) values. */
int
rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
{
	const float *src = (const float *)input;
	uint16_t *dst = (uint16_t *)output;
	uint64_t idx;

	if (nb_elements == 0 || input == NULL || output == NULL)
		return -EINVAL;

	for (idx = 0; idx < nb_elements; idx++)
		dst[idx] = __float32_to_float16_scalar_rtn(src[idx]);

	return 0;
}
588 
/* Convert a half precision floating point number (float16) into a single precision
 * floating point number (float32). The conversion is exact: every float16 value,
 * including subnormals, infinities and NaNs, is representable in float32, so no
 * rounding is needed.
 */
static float
__float16_to_float32_scalar_rtx(uint16_t f16)
{
	union float32 f32; /* float32 output */
	uint16_t f16_s;	   /* float16 sign */
	uint16_t f16_e;	   /* float16 exponent */
	uint16_t f16_m;	   /* float16 mantissa */
	uint32_t f32_s;	   /* float32 sign */
	uint32_t f32_e;	   /* float32 exponent */
	uint32_t f32_m;	   /* float32 mantissa*/
	uint8_t shift;	   /* number of bits to be shifted */
	uint32_t clz;	   /* count of leading zeroes */
	int e_16;	   /* float16 exponent unbiased */

	/* unpack the float16 fields */
	f16_s = (f16 & FP16_MASK_S) >> FP16_LSB_S;
	f16_e = (f16 & FP16_MASK_E) >> FP16_LSB_E;
	f16_m = (f16 & FP16_MASK_M) >> FP16_LSB_M;

	f32_s = f16_s;
	switch (f16_e) {
	case (FP16_MASK_E >> FP16_LSB_E): /* float16: infinity or nan */
		f32_e = FP32_MASK_E >> FP32_LSB_E;
		if (f16_m == 0x0) { /* infinity */
			f32_m = f16_m;
		} else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
			f32_m = f16_m;
			shift = FP32_MSB_M - FP16_MSB_M;
			f32_m = (f32_m << shift) & FP32_MASK_M;
			f32_m |= BIT(FP32_MSB_M);
		}
		break;
	case 0: /* float16: zero or sub-normal */
		f32_m = f16_m;
		if (f16_m == 0) { /* zero signed */
			f32_e = 0;
		} else { /* subnormal numbers */
			/* leading zeroes within the 10-bit mantissa field:
			 * rte_clz32 counts over 32 bits, so drop the 22 high
			 * bits that are always zero
			 */
			clz = rte_clz32((uint32_t)f16_m) - sizeof(uint32_t) * 8 + FP16_LSB_E;
			e_16 = (int)f16_e - clz;
			f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E;

			/* normalize: shift until the leading 1 becomes the
			 * implicit bit, then keep the mantissa remainder
			 */
			shift = clz + (FP32_MSB_M - FP16_MSB_M) + 1;
			f32_m = (f32_m << shift) & FP32_MASK_M;
		}
		break;
	default: /* normal numbers */
		f32_m = f16_m;
		e_16 = (int)f16_e;
		f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E;

		shift = (FP32_MSB_M - FP16_MSB_M);
		f32_m = (f32_m << shift) & FP32_MASK_M;
	}

	f32.u = FP32_PACK(f32_s, f32_e, f32_m);

	return f32.f;
}
649 
/* Convert a buffer of float16 (IEEE binary16) values into float32 values. */
int
rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
{
	const uint16_t *src = (const uint16_t *)input;
	float *dst = (float *)output;
	uint64_t idx;

	if (nb_elements == 0 || input == NULL || output == NULL)
		return -EINVAL;

	for (idx = 0; idx < nb_elements; idx++)
		dst[idx] = __float16_to_float32_scalar_rtx(src[idx]);

	return 0;
}
672