xref: /dpdk/lib/mldev/mldev_utils_neon.c (revision fc54766b1612b29c75fd39bd015f27dd57feec5f)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "mldev_utils.h"

#include <arm_neon.h>

/* Description:
 * This file implements vector versions of Machine Learning utility functions used to convert data
 * types from higher precision to lower precision and vice versa. The implementation is based on
 * Arm Neon intrinsics.
 */
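
/* Illustrative usage sketch (the buffer size and scale value below are
 * examples and are not taken from this file). The integer quantize/dequantize
 * routines take a scale factor, an element count and untyped input/output
 * buffers; the float16/bfloat16 routines take only the element count and the
 * buffers. All of them return 0 on success or -EINVAL on invalid arguments:
 *
 *	float in[16];
 *	int8_t out[16];
 *
 *	int ret = rte_ml_io_float32_to_int8(16.0f, 16, in, out);
 *
 * Here ret is 0 on success, or -EINVAL when scale is 0, nb_elements is 0 or
 * either buffer pointer is NULL.
 */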

static inline void
__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
{
	int16x4_t s16x4_l;
	int16x4_t s16x4_h;
	float32x4_t f32x4;
	int16x8_t s16x8;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_l = vqmovn_s32(s32x4);

	/* load next 4 float32 elements, scale, convert, saturate narrow to int16.
	 * Use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	s32x4 = vcvtaq_s32_f32(f32x4);
	s16x4_h = vqmovn_s32(s32x4);

	/* combine lower and higher int16x4_t to int16x8_t */
	s16x8 = vcombine_s16(s16x4_l, s16x4_h);

	/* narrow to int8_t */
	s8x8 = vqmovn_s16(s16x8);

	/* store 8 elements */
	vst1_s8(output, s8x8);
}

static inline void
__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
{
	int32_t s32;
	int16_t s16;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	s16 = vqmovns_s32(s32);

	/* convert to int8_t */
	*output = vqmovnh_s16(s16);
}
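
/* Worked example for the scalar path above: with scale = 64.0f, an input of
 * 0.49f scales to 31.36f and rounds to 31 (round to nearest, ties away from
 * zero), while an input of 3.0f scales to 192.0f and saturates to 127
 * (INT8_MAX) through the two saturating-narrow steps.
 */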

int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int8_t *)output;
	/* each vector iteration handles 8 elements: two float32x4 loads
	 * narrowed to one int8x8 store
	 */
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
{
	float32x4_t f32x4;
	int16x8_t s16x8;
	int16x4_t s16x4;
	int32x4_t s32x4;
	int8x8_t s8x8;

	/* load 8 x int8_t elements */
	s8x8 = vld1_s8(input);

	/* widen int8_t to int16_t */
	s16x8 = vmovl_s8(s8x8);

	/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_low_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
	s16x4 = vget_high_s16(s16x8);
	s32x4 = vmovl_s16(s16x4);
	f32x4 = vcvtq_f32_s32(s32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
{
	uint16x4_t u16x4_l;
	uint16x4_t u16x4_h;
	float32x4_t f32x4;
	uint32x4_t u32x4;
	uint16x8_t u16x8;
	uint8x8_t u8x8;

	/* load 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_l = vqmovn_u32(u32x4);

	/* load next 4 float elements, scale, convert, saturate narrow to uint16_t.
	 * use round to nearest with ties away rounding mode.
	 */
	f32x4 = vld1q_f32(input + 4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	u32x4 = vcvtaq_u32_f32(f32x4);
	u16x4_h = vqmovn_u32(u32x4);

	/* combine lower and higher uint16x4_t */
	u16x8 = vcombine_u16(u16x4_l, u16x4_h);

	/* narrow to uint8x8_t */
	u8x8 = vqmovn_u16(u16x8);

	/* store 8 elements */
	vst1_u8(output, u8x8);
}

static inline void
__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
{
	uint32_t u32;
	uint16_t u16;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	u16 = vqmovns_u32(u32);

	/* convert to uint8_t */
	*output = vqmovnh_u16(u16);
}
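
/* Note on the unsigned conversions: the float-to-unsigned convert intrinsics
 * saturate at the bottom of the range, so a scaled input that is negative
 * converts to 0 rather than wrapping around.
 */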

int
rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint8_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint8_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x8_t u16x8;
	uint16x4_t u16x4;
	uint32x4_t u32x4;
	uint8x8_t u8x8;

	/* load 8 x uint8_t elements */
	u8x8 = vld1_u8(input);

	/* widen uint8_t to uint16_t */
	u16x8 = vmovl_u8(u8x8);

	/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_low_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output, f32x4);

	/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
	u16x4 = vget_high_u16(u16x8);
	u32x4 = vmovl_u16(u16x4);
	f32x4 = vcvtq_f32_u32(u32x4);
	f32x4 = vmulq_n_f32(f32x4, scale);
	vst1q_f32(output + 4, f32x4);
}

static inline void
__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint8_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint8_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint8_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert to int32x4_t using round to nearest with ties away rounding mode */
	s32x4 = vcvtaq_s32_f32(f32x4);

	/* saturate narrow to int16x4_t */
	s16x4 = vqmovn_s32(s32x4);

	/* store 4 elements */
	vst1_s16(output, s16x4);
}

static inline void
__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
{
	int32_t s32;

	/* scale and convert, round to nearest with ties away rounding mode */
	s32 = vcvtas_s32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_s32(s32);
}

int
rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	int16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (int16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
{
	float32x4_t f32x4;
	int16x4_t s16x4;
	int32x4_t s32x4;

	/* load 4 x int16_t elements */
	s16x4 = vld1_s16(input);

	/* widen int16_t to int32_t */
	s32x4 = vmovl_s16(s16x4);

	/* convert int32_t to float */
	f32x4 = vcvtq_f32_s32(s32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
{
	*output = scale * vcvts_f32_s32((int32_t)*input);
}

int
rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	int16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (int16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(int16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 float elements */
	f32x4 = vld1q_f32(input);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* convert using round to nearest with ties to away rounding mode */
	u32x4 = vcvtaq_u32_f32(f32x4);

	/* saturate narrow */
	u16x4 = vqmovn_u32(u32x4);

	/* store 4 elements */
	vst1_u16(output, u16x4);
}

static inline void
__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
{
	uint32_t u32;

	/* scale and convert, round to nearest with ties away rounding mode */
	u32 = vcvtas_u32_f32(scale * (*input));

	/* saturate narrow */
	*output = vqmovns_u32(u32);
}

int
rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
{
	float *input_buffer;
	uint16_t *output_buffer;
	uint64_t nb_iterations;
	uint64_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float *)input;
	output_buffer = (uint16_t *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
{
	float32x4_t f32x4;
	uint16x4_t u16x4;
	uint32x4_t u32x4;

	/* load 4 x uint16_t elements */
	u16x4 = vld1_u16(input);

	/* widen uint16_t to uint32_t */
	u32x4 = vmovl_u16(u16x4);

	/* convert uint32_t to float */
	f32x4 = vcvtq_f32_u32(u32x4);

	/* scale */
	f32x4 = vmulq_n_f32(f32x4, scale);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
{
	*output = scale * vcvts_f32_u32((uint32_t)*input);
}

int
rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
{
	uint16_t *input_buffer;
	float *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (uint16_t *)input;
	output_buffer = (float *)output;
	vlen = 2 * sizeof(float) / sizeof(uint16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert to float16x4_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store float16x4_t */
	vst1_f16(output, f16x4);
}

static inline void
__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
{
	float32x4_t f32x4;
	float16x4_t f16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to float16_t */
	f16x4 = vcvt_f16_f32(f32x4);

	/* store lane 0 (1 element) */
	vst1_lane_f16(output, f16x4, 0);
}

int
rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	float16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (float16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_float16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_float16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load 4 x float16_t elements */
	f16x4 = vld1_f16(input);

	/* convert float16x4_t to float32x4_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
{
	float16x4_t f16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	f16x4 = vld1_dup_f16(input);

	/* convert float16_t to float32_t */
	f32x4 = vcvt_f32_f16(f16x4);

	/* store 1 element */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	float16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

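/* The bfloat16 conversions below are compiled only when the target and
 * toolchain provide the BF16 extension (__ARM_FEATURE_BF16).
 */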
#ifdef __ARM_FEATURE_BF16

static inline void
__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
{
	float32x4_t f32x4;
	bfloat16x4_t bf16x4;

	/* load 4 x float32_t elements */
	f32x4 = vld1q_f32(input);

	/* convert float32x4_t to bfloat16x4_t */
	bf16x4 = vcvt_bf16_f32(f32x4);

	/* store bfloat16x4_t */
	vst1_bf16(output, bf16x4);
}

static inline void
__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
{
	float32x4_t f32x4;
	bfloat16x4_t bf16x4;

	/* load element to 4 lanes */
	f32x4 = vld1q_dup_f32(input);

	/* convert float32_t to bfloat16_t */
	bf16x4 = vcvt_bf16_f32(f32x4);

	/* store lane 0 (1 element) */
	vst1_lane_bf16(output, bf16x4, 0);
}

int
rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
{
	float32_t *input_buffer;
	bfloat16_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (float32_t *)input;
	output_buffer = (bfloat16_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

static inline void
__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
{
	bfloat16x4_t bf16x4;
	float32x4_t f32x4;

	/* load 4 x bfloat16_t elements */
	bf16x4 = vld1_bf16(input);

	/* convert bfloat16x4_t to float32x4_t */
	f32x4 = vcvt_f32_bf16(bf16x4);

	/* store float32x4_t */
	vst1q_f32(output, f32x4);
}

static inline void
__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
{
	bfloat16x4_t bf16x4;
	float32x4_t f32x4;

	/* load element to 4 lanes */
	bf16x4 = vld1_dup_bf16(input);

	/* convert bfloat16_t to float32_t */
	f32x4 = vcvt_f32_bf16(bf16x4);

	/* store lane 0 (1 element) */
	vst1q_lane_f32(output, f32x4, 0);
}

int
rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
{
	bfloat16_t *input_buffer;
	float32_t *output_buffer;
	uint64_t nb_iterations;
	uint32_t vlen;
	uint64_t i;

	if ((nb_elements == 0) || (input == NULL) || (output == NULL))
		return -EINVAL;

	input_buffer = (bfloat16_t *)input;
	output_buffer = (float32_t *)output;
	vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
	nb_iterations = nb_elements / vlen;

	/* convert vlen elements in each iteration */
	for (i = 0; i < nb_iterations; i++) {
		__bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
		input_buffer += vlen;
		output_buffer += vlen;
	}

	/* convert leftover elements */
	i = i * vlen;
	for (; i < nb_elements; i++) {
		__bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
		input_buffer++;
		output_buffer++;
	}

	return 0;
}

#endif /* __ARM_FEATURE_BF16 */