xref: /spdk/lib/util/base64_sve.c (revision cdb0726b95631d46eaf4f2e39ddb6533f150fd27)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright(c) ARM Limited. 2021 All rights reserved.
3  */
4 
5 #ifndef __aarch64__
6 #error Unsupported hardware
7 #endif
8 
9 #include "spdk/stdinc.h"
10 #include <arm_sve.h>
11 
/*
 * Look up every byte of `indices` in a table of 8 * vl bytes that is held in
 * eight vector registers (tbl_vec0 = bytes [0, vl), tbl_vec1 = [vl, 2 * vl), ...).
 *
 * svtbl_u8 returns 0 for an index that is outside the vector, so each slice is
 * probed with the index rebased to that slice and the eight partial results
 * are summed: at most one slice contributes a non-zero value per lane.
 *
 * \param tbl_vec0..tbl_vec7 consecutive vl-byte slices of the table.
 * \param indices table offsets to look up. Offsets >= 8 * vl match no slice
 *        and yield 0, so the caller has to reject them beforehand.
 * \param output receives the looked-up bytes; lanes inactive in p8_in are 0.
 * \param p8_in predicate selecting the lanes that take part in the lookup.
 * \param vl vector length in bytes.
 *
 * \return -1 if any active lane looked up the value 255, 0 otherwise.
 */
static int
table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t sum = svdup_n_u8(0);

	/*
	 * tbl_vec0 and tbl_vec1 must be consulted like every other slice.
	 * Skipping them makes any index below 2 * vl add up to 0, which cannot
	 * be told apart from a valid table entry of 0, so entries marked 255 in
	 * that range would never be reported.
	 */
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec3, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec4, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec5, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec6, indices), sum);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec7, indices), sum);

	*output = sum;

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, sum, 255))) {
		return -1;
	}

	return 0;
}
50 
/*
 * Look up every byte of `indices` in a table of 4 * vl bytes held in four
 * vector registers (tbl_vec0 = bytes [0, vl), tbl_vec1 = [vl, 2 * vl), ...).
 *
 * svtbl_u8 returns 0 for an index outside the vector, so the index is rebased
 * by vl before each further slice is probed and the partial results are
 * summed; at most one slice contributes a non-zero value per lane.
 *
 * *output receives the looked-up bytes, with lanes inactive in p8_in zeroed.
 *
 * \return -1 if any active lane looked up the value 255, 0 otherwise.
 */
static int
table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t acc = svdup_n_u8(0);
	svbool_t hit_255;

	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec3, indices), acc);

	*output = acc;

	hit_255 = svcmpeq_n_u8(p8_in, acc, 255);

	return svcntp_b8(p8_in, hit_255) ? -1 : 0;
}
78 
/*
 * Look up every byte of `indices` in a table of 3 * vl bytes held in three
 * vector registers (tbl_vec0 = bytes [0, vl), tbl_vec1 = [vl, 2 * vl),
 * tbl_vec2 = [2 * vl, 3 * vl)).
 *
 * svtbl_u8 returns 0 for an index outside the vector, so the index is rebased
 * by vl before each further slice is probed and the partial results are
 * summed; at most one slice contributes a non-zero value per lane.
 *
 * *output receives the looked-up bytes, with lanes inactive in p8_in zeroed.
 *
 * \return -1 if any active lane looked up the value 255, 0 otherwise.
 */
static int
table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices,
		  svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t acc = svdup_n_u8(0);
	svbool_t hit_255;

	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, indices), acc);

	*output = acc;

	hit_255 = svcmpeq_n_u8(p8_in, acc, 255);

	return svcntp_b8(p8_in, hit_255) ? -1 : 0;
}
103 
/*
 * Look up every byte of `indices` in a table of 2 * vl bytes held in two
 * vector registers (tbl_vec0 = bytes [0, vl), tbl_vec1 = [vl, 2 * vl)).
 *
 * svtbl_u8 returns 0 for an index outside the vector, so the second slice is
 * probed with the index rebased by vl and both partial results are summed;
 * at most one of them is non-zero per lane.
 *
 * *output receives the looked-up bytes, with lanes inactive in p8_in zeroed.
 *
 * \return -1 if any active lane looked up the value 255, 0 otherwise.
 */
static int
table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output,
		  svbool_t p8_in, uint64_t vl)
{
	svuint8_t acc = svdup_n_u8(0);
	svbool_t hit_255;

	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), acc);

	*output = acc;

	hit_255 = svcmpeq_n_u8(p8_in, acc, 255);

	return svcntp_b8(p8_in, hit_255) ? -1 : 0;
}
125 
/*
 * Split groups of three input bytes into four 6-bit values each.
 *
 * Despite its name this is the encode-side step: it reads 8-bit bytes and
 * produces the 6-bit values that are later used as table offsets.
 *
 * For every lane active in pred, three consecutive bytes b0 b1 b2 are loaded
 * from src (svld3 puts all b0 in one vector, all b1 in the next, and so on)
 * and turned into:
 *   temp0 = b0 >> 2
 *   temp1 = ((b0 << 4) | (b1 >> 4)) & 0x3F
 *   temp2 = ((b1 << 2) | (b2 >> 6)) & 0x3F
 *   temp3 = b2 & 0x3F
 * Inactive lanes of all four outputs are zero.
 *
 * \param pred lanes (three-byte groups) to process.
 * \param src first byte of the first group; only read, never written.
 * \param temp0..temp3 receive the four 6-bit values of every group.
 */
static inline void
convert_6bits_to_8bits(svbool_t pred, const uint8_t *src, svuint8_t *temp0, svuint8_t *temp1,
		       svuint8_t *temp2, svuint8_t *temp3)
{
	const svuint8_t low6_mask = svdup_u8(0x3F);
	svuint8x3_t ld_enc_input;
	svuint8_t str0, str1, str2;
	svuint8_t merged;

	ld_enc_input = svld3_u8(pred, src);

	str0 = svget3_u8(ld_enc_input, 0);
	str1 = svget3_u8(ld_enc_input, 1);
	str2 = svget3_u8(ld_enc_input, 2);

	*temp0 = svlsr_n_u8_z(pred, str0, 2);

	/* The 8-bit shift left drops the bits that belong to the previous value. */
	merged = svorr_u8_z(pred, svlsr_n_u8_z(pred, str1, 4), svlsl_n_u8_z(pred, str0, 4));
	*temp1 = svand_u8_z(pred, merged, low6_mask);

	merged = svorr_u8_z(pred, svlsr_n_u8_z(pred, str2, 6), svlsl_n_u8_z(pred, str1, 2));
	*temp2 = svand_u8_z(pred, merged, low6_mask);

	*temp3 = svand_u8_z(pred, str2, low6_mask);
}
149 
/*
 * Pack groups of four values into three output bytes each.
 *
 * Despite its name this is the decode-side step. For every lane active in
 * pred the outputs are:
 *   output0 = (temp0 << 2) | (temp1 >> 4)
 *   output1 = (temp1 << 4) | (temp2 >> 2)
 *   output2 = (temp2 << 6) | temp3
 * All shifts are 8-bit wide, so bits shifted past bit 7 are dropped.
 * Inactive lanes of the outputs are zero.
 */
static inline void
convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
		       svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2)
{
	svuint8_t hi_part, lo_part;

	hi_part = svlsl_n_u8_z(pred, temp0, 2);
	lo_part = svlsr_n_u8_z(pred, temp1, 4);
	*output0 = svorr_u8_z(pred, hi_part, lo_part);

	hi_part = svlsl_n_u8_z(pred, temp1, 4);
	lo_part = svlsr_n_u8_z(pred, temp2, 2);
	*output1 = svorr_u8_z(pred, hi_part, lo_part);

	hi_part = svlsl_n_u8_z(pred, temp2, 6);
	*output2 = svorr_u8_z(pred, hi_part, temp3);
}
158 
/*
 * Store one batch of encoded characters and move the caller's cursors past
 * the data that was consumed and produced.
 *
 * out0..out3 hold the 1st..4th output character of every group; svst4
 * interleaves them again so each group occupies four consecutive bytes at *dst.
 *
 * \return number of three-byte input groups handled in this batch.
 */
static inline uint64_t
base64_encode_sve_commit(char **dst, const void **src, size_t *src_len, svbool_t pred,
			 svuint8_t out0, svuint8_t out1, svuint8_t out2, svuint8_t out3)
{
	const uint64_t groups = svcntp_b8(pred, pred);

	svst4_u8(pred, (uint8_t *)*dst, svcreate4_u8(out0, out1, out2, out3));

	/* Arithmetic on a void pointer is a GNU extension; step through a byte pointer. */
	*src = (const uint8_t *)*src + groups * 3;
	*dst += groups * 4;
	*src_len -= groups * 3;

	return groups;
}

/*
 * Encode as many complete three-byte groups of *src as possible with SVE.
 *
 * The 64-byte table at enc_table is kept in vector registers; how many
 * registers that takes depends on the vector length reported by svcntb():
 * four for 16 bytes, two for 32 or 48 bytes, one for 64 bytes and above.
 *
 * \param dst in/out cursor into the output buffer; advanced by four bytes per
 *        encoded group. No terminator or padding is written here.
 * \param enc_table table that is indexed with the 6-bit values; 64 bytes are read.
 * \param src in/out cursor into the input; advanced by three bytes per group.
 * \param src_len in/out number of input bytes left; on return it holds the
 *        remainder that does not form a complete group (*src_len % 3).
 */
static void
base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len)
{
	const uint8_t *table = (const uint8_t *)enc_table;
	const uint64_t vl = svcntb();
	const uint64_t total = *src_len / 3;
	uint64_t done = 0;
	svuint8_t idx0, idx1, idx2, idx3;
	svuint8_t out0, out1, out2, out3;
	svuint8_t tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3;
	svbool_t p8_all = svptrue_b8();
	svbool_t pred;

	if (total == 0) {
		/* Not even one complete group: nothing to encode, skip the table loads. */
		return;
	}

	if (vl == 16) {
		tbl_enc0 = svld1_u8(p8_all, table + 0);
		tbl_enc1 = svld1_u8(p8_all, table + 16);
		tbl_enc2 = svld1_u8(p8_all, table + 32);
		tbl_enc3 = svld1_u8(p8_all, table + 48);

		while (done < total) {
			/* One lane per remaining group, at most vl groups per pass. */
			pred = svwhilelt_b8(done, total);

			convert_6bits_to_8bits(pred, (uint8_t *)*src, &idx0, &idx1, &idx2, &idx3);

			/*
			 * The lookup result is deliberately ignored: it only reports
			 * table entries equal to 255, which are not expected in an
			 * encode table of printable characters.
			 */
			(void)table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, idx0, &out0, pred, vl);
			(void)table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, idx1, &out1, pred, vl);
			(void)table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, idx2, &out2, pred, vl);
			(void)table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, idx3, &out3, pred, vl);

			done += base64_encode_sve_commit(dst, src, src_len, pred, out0, out1, out2, out3);
		}
	} else if (vl == 32 || vl == 48) {
		tbl_enc0 = svld1_u8(p8_all, table + 0);
		/* Only the bytes up to offset 64 belong to the table. */
		pred = svwhilelt_b8(vl, (uint64_t)64);
		tbl_enc1 = svld1_u8(pred, table + vl);

		while (done < total) {
			pred = svwhilelt_b8(done, total);

			convert_6bits_to_8bits(pred, (uint8_t *)*src, &idx0, &idx1, &idx2, &idx3);

			(void)table_lookup_2vec(tbl_enc0, tbl_enc1, idx0, &out0, pred, vl);
			(void)table_lookup_2vec(tbl_enc0, tbl_enc1, idx1, &out1, pred, vl);
			(void)table_lookup_2vec(tbl_enc0, tbl_enc1, idx2, &out2, pred, vl);
			(void)table_lookup_2vec(tbl_enc0, tbl_enc1, idx3, &out3, pred, vl);

			done += base64_encode_sve_commit(dst, src, src_len, pred, out0, out1, out2, out3);
		}
	} else if (vl >= 64) {
		/* The whole table fits into one register; load just its 64 bytes. */
		pred = svwhilelt_b8((uint64_t)0, (uint64_t)64);
		tbl_enc0 = svld1_u8(pred, table);

		while (done < total) {
			pred = svwhilelt_b8(done, total);

			convert_6bits_to_8bits(pred, (uint8_t *)*src, &idx0, &idx1, &idx2, &idx3);

			out0 = svtbl_u8(tbl_enc0, idx0);
			out1 = svtbl_u8(tbl_enc0, idx1);
			out2 = svtbl_u8(tbl_enc0, idx2);
			out3 = svtbl_u8(tbl_enc0, idx3);

			done += base64_encode_sve_commit(dst, src, src_len, pred, out0, out1, out2, out3);
		}
	}
}
253 
/*
 * Load one batch of four-character groups and split it into four vectors
 * holding the 1st..4th character of every group.
 *
 * \return -1 if any active lane holds a byte >= 128, 0 otherwise. Such bytes
 *         have to be rejected before the table lookup, because the lookup
 *         helpers do not report offsets that lie beyond the table.
 */
static inline int
base64_decode_sve_load(svbool_t pred, const uint8_t *src, svuint8_t *str0, svuint8_t *str1,
		       svuint8_t *str2, svuint8_t *str3)
{
	svuint8x4_t ld_dec_input = svld4_u8(pred, src);

	*str0 = svget4_u8(ld_dec_input, 0);
	*str1 = svget4_u8(ld_dec_input, 1);
	*str2 = svget4_u8(ld_dec_input, 2);
	*str3 = svget4_u8(ld_dec_input, 3);

	if (svcntp_b8(pred, svcmpge_n_u8(pred, *str0, 128)) ||
	    svcntp_b8(pred, svcmpge_n_u8(pred, *str1, 128)) ||
	    svcntp_b8(pred, svcmpge_n_u8(pred, *str2, 128)) ||
	    svcntp_b8(pred, svcmpge_n_u8(pred, *str3, 128))) {
		return -1;
	}

	return 0;
}

/*
 * Pack one batch of looked-up values into bytes, store them and move the
 * caller's cursors past the data that was consumed and produced.
 *
 * \return number of four-character input groups handled in this batch.
 */
static inline uint64_t
base64_decode_sve_commit(void **dst, const uint8_t **src, size_t *src_len, svbool_t pred,
			 svuint8_t val0, svuint8_t val1, svuint8_t val2, svuint8_t val3)
{
	const uint64_t groups = svcntp_b8(pred, pred);
	svuint8_t byte0, byte1, byte2;

	convert_8bits_to_6bits(pred, val0, val1, val2, val3, &byte0, &byte1, &byte2);
	svst3_u8(pred, (uint8_t *)*dst, svcreate3_u8(byte0, byte1, byte2));

	*src += groups * 4;
	/* Arithmetic on a void pointer is a GNU extension; step through a byte pointer. */
	*dst = (uint8_t *)*dst + groups * 3;
	*src_len -= groups * 4;

	return groups;
}

/*
 * Decode as many complete four-character groups of *src as possible with SVE.
 *
 * The first 128 bytes of dec_table are kept in vector registers; how many
 * registers that takes depends on the vector length reported by svcntb():
 * eight for 16 bytes, four for 32, three for 48, two for 64..112 and one for
 * 128 bytes and above.
 *
 * Decoding stops silently at the first batch that contains a byte >= 128 or a
 * character whose table entry is 255. Nothing of that batch is stored and the
 * cursors keep pointing at its start, so the caller can tell from the
 * remaining *src_len that input was left unprocessed.
 *
 * \param dst in/out cursor into the output buffer; advanced by three bytes per
 *        decoded group.
 * \param dec_table table indexed with the input byte; 128 bytes are read.
 * \param src in/out cursor into the input; advanced by four bytes per group.
 * \param src_len in/out number of input bytes left.
 */
static void
base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len)
{
	const uint64_t vl = svcntb();
	const uint64_t total = *src_len / 4;
	uint64_t done = 0;
	svuint8_t str0, str1, str2, str3;
	svuint8_t val0, val1, val2, val3;
	svuint8_t tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, tbl_dec7;
	svbool_t p8_all = svptrue_b8();
	svbool_t pred;

	if (total == 0) {
		/* Not even one complete group: nothing to decode, skip the table loads. */
		return;
	}

	if (vl == 16) {
		tbl_dec0 = svld1_u8(p8_all, dec_table + 0);
		tbl_dec1 = svld1_u8(p8_all, dec_table + 16);
		tbl_dec2 = svld1_u8(p8_all, dec_table + 32);
		tbl_dec3 = svld1_u8(p8_all, dec_table + 48);
		tbl_dec4 = svld1_u8(p8_all, dec_table + 64);
		tbl_dec5 = svld1_u8(p8_all, dec_table + 80);
		tbl_dec6 = svld1_u8(p8_all, dec_table + 96);
		tbl_dec7 = svld1_u8(p8_all, dec_table + 112);

		while (done < total) {
			/* One lane per remaining group, at most vl groups per pass. */
			pred = svwhilelt_b8(done, total);

			if (base64_decode_sve_load(pred, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5,
					      tbl_dec6, tbl_dec7, str0, &val0, pred, vl) ||
			    table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5,
					      tbl_dec6, tbl_dec7, str1, &val1, pred, vl) ||
			    table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5,
					      tbl_dec6, tbl_dec7, str2, &val2, pred, vl) ||
			    table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5,
					      tbl_dec6, tbl_dec7, str3, &val3, pred, vl)) {
				return;
			}

			done += base64_decode_sve_commit(dst, src, src_len, pred, val0, val1, val2, val3);
		}
	} else if (vl == 32) {
		tbl_dec0 = svld1_u8(p8_all, dec_table + 0);
		tbl_dec1 = svld1_u8(p8_all, dec_table + vl);
		tbl_dec2 = svld1_u8(p8_all, dec_table + vl * 2);
		tbl_dec3 = svld1_u8(p8_all, dec_table + vl * 3);

		while (done < total) {
			pred = svwhilelt_b8(done, total);

			if (base64_decode_sve_load(pred, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str0, &val0, pred, vl) ||
			    table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str1, &val1, pred, vl) ||
			    table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str2, &val2, pred, vl) ||
			    table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str3, &val3, pred, vl)) {
				return;
			}

			done += base64_decode_sve_commit(dst, src, src_len, pred, val0, val1, val2, val3);
		}
	} else if (vl == 48) {
		tbl_dec0 = svld1_u8(p8_all, dec_table + 0);
		tbl_dec1 = svld1_u8(p8_all, dec_table + vl);
		/* Only the bytes up to offset 128 are loaded into the last register. */
		pred = svwhilelt_b8(vl * 2, (uint64_t)128);
		tbl_dec2 = svld1_u8(pred, dec_table + 2 * vl);

		while (done < total) {
			pred = svwhilelt_b8(done, total);

			if (base64_decode_sve_load(pred, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str0, &val0, pred, vl) ||
			    table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str1, &val1, pred, vl) ||
			    table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str2, &val2, pred, vl) ||
			    table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str3, &val3, pred, vl)) {
				return;
			}

			done += base64_decode_sve_commit(dst, src, src_len, pred, val0, val1, val2, val3);
		}
	} else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) {
		tbl_dec0 = svld1_u8(p8_all, dec_table + 0);
		/* Only the bytes up to offset 128 are loaded into the second register. */
		pred = svwhilelt_b8(vl, (uint64_t)128);
		tbl_dec1 = svld1_u8(pred, dec_table + vl);

		while (done < total) {
			pred = svwhilelt_b8(done, total);

			if (base64_decode_sve_load(pred, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str0, &val0, pred, vl) ||
			    table_lookup_2vec(tbl_dec0, tbl_dec1, str1, &val1, pred, vl) ||
			    table_lookup_2vec(tbl_dec0, tbl_dec1, str2, &val2, pred, vl) ||
			    table_lookup_2vec(tbl_dec0, tbl_dec1, str3, &val3, pred, vl)) {
				return;
			}

			done += base64_decode_sve_commit(dst, src, src_len, pred, val0, val1, val2, val3);
		}
	} else if (vl >= 128) {
		/* The 128 table bytes fit into one register; load just those. */
		pred = svwhilelt_b8((uint64_t)0, (uint64_t)128);
		tbl_dec0 = svld1_u8(pred, dec_table + 0);

		while (done < total) {
			pred = svwhilelt_b8(done, total);

			if (base64_decode_sve_load(pred, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			val0 = svtbl_u8(tbl_dec0, str0);
			val1 = svtbl_u8(tbl_dec0, str1);
			val2 = svtbl_u8(tbl_dec0, str2);
			val3 = svtbl_u8(tbl_dec0, str3);

			if (svcntp_b8(pred, svcmpeq_n_u8(pred, val0, 255)) ||
			    svcntp_b8(pred, svcmpeq_n_u8(pred, val1, 255)) ||
			    svcntp_b8(pred, svcmpeq_n_u8(pred, val2, 255)) ||
			    svcntp_b8(pred, svcmpeq_n_u8(pred, val3, 255))) {
				return;
			}

			done += base64_decode_sve_commit(dst, src, src_len, pred, val0, val1, val2, val3);
		}
	}
}
473