xref: /spdk/lib/util/base64_sve.c (revision b02581a89058ebaebe03bd0e16e3b58adfe406c1)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2022 Intel Corporation.
3  *   Copyright(c) ARM Limited. 2021 All rights reserved.
4  */
5 
6 #ifndef __aarch64__
7 #error Unsupported hardware
8 #endif
9 
10 #include "spdk/stdinc.h"
11 #include <arm_sve.h>
12 
/*
 * Translate every active byte of `indices` through a lookup table that is
 * spread over eight consecutive vectors of `vl` bytes each
 * (tbl_vec0 .. tbl_vec7).
 *
 * svtbl_u8() yields 0 for an index that is not smaller than the vector
 * length. The indices are re-based by `vl` before each slice is consulted, so
 * for a given lane at most one slice lookup can hit; adding the per-slice
 * results together gives the table entry for that lane.
 *
 * tbl_vec0 and tbl_vec1 are intentionally never read (see the comment in the
 * body). Lanes whose index falls into those two slices are forced to 255,
 * the value this function reports as invalid, so that they are rejected.
 *
 * `2 * vl` must fit in a uint8_t; base64_decode_sve() uses this helper for
 * vl == 16 only.
 *
 * indices: bytes to translate (expected to be < 8 * vl).
 * output:  receives the translated bytes; inactive lanes are zero.
 * p8_in:   governing predicate selecting the lanes to translate.
 * vl:      vector length in bytes.
 *
 * Returns 0 on success, -1 if any active lane translated to 255.
 */
static int
table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t res2, res3, res4, res5, res6, res7;
	svbool_t skipped;

	/*
	 * In base64 decode table, the first 32 elements are invalid value,
	 * so skip tbl_vec0 and tbl_vec1.
	 *
	 * Remember which active lanes point into the skipped slices: after the
	 * subtraction below their indices wrap around to large values, every
	 * svtbl_u8() returns 0 for them, and without this mask they would be
	 * indistinguishable from a valid lookup result of 0.
	 */
	skipped = svcmplt_n_u8(p8_in, indices, (uint8_t)(2 * vl));

	indices = svsub_n_u8_z(p8_in, indices, 2 * vl);
	res2 = svtbl_u8(tbl_vec2, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res3 = svtbl_u8(tbl_vec3, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res4 = svtbl_u8(tbl_vec4, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res5 = svtbl_u8(tbl_vec5, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res6 = svtbl_u8(tbl_vec6, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res7 = svtbl_u8(tbl_vec7, indices);

	/* Merge the per-slice results; the zeroing form clears inactive lanes. */
	*output = svdup_n_u8(0);
	*output = svadd_u8_z(p8_in, res2, *output);
	*output = svadd_u8_z(p8_in, res3, *output);
	*output = svadd_u8_z(p8_in, res4, *output);
	*output = svadd_u8_z(p8_in, res5, *output);
	*output = svadd_u8_z(p8_in, res6, *output);
	*output = svadd_u8_z(p8_in, res7, *output);

	/* Lanes that belong to the skipped slices are invalid by definition. */
	*output = svsel_u8(skipped, svdup_n_u8(255), *output);

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
		return -1;
	}

	return 0;
}
51 
/*
 * Translate every active byte of `indices` through a lookup table that is
 * spread over four consecutive vectors of `vl` bytes each.
 *
 * svtbl_u8() yields 0 for an index that is not smaller than the vector
 * length. The indices are re-based by `vl` before each further slice is
 * consulted, and the per-slice results are added together.
 *
 * indices: bytes to translate.
 * output:  receives the translated bytes; inactive lanes are zero.
 * p8_in:   governing predicate selecting the lanes to translate.
 * vl:      vector length in bytes.
 *
 * Returns -1 if any active lane of the result equals 255, 0 otherwise.
 */
static int
table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t acc = svdup_n_u8(0);
	svbool_t invalid;

	/* Slice 0 is addressed with the indices as given. */
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), acc);

	/* Each following slice starts `vl` entries further into the table. */
	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec3, indices), acc);

	*output = acc;

	invalid = svcmpeq_n_u8(p8_in, acc, 255);

	return svcntp_b8(p8_in, invalid) ? -1 : 0;
}
79 
/*
 * Translate every active byte of `indices` through a lookup table that is
 * spread over three consecutive vectors of `vl` bytes each.
 *
 * svtbl_u8() yields 0 for an index that is not smaller than the vector
 * length. The indices are re-based by `vl` before each further slice is
 * consulted, and the per-slice results are added together.
 *
 * indices: bytes to translate.
 * output:  receives the translated bytes; inactive lanes are zero.
 * p8_in:   governing predicate selecting the lanes to translate.
 * vl:      vector length in bytes.
 *
 * Returns -1 if any active lane of the result equals 255, 0 otherwise.
 */
static int
table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices,
		  svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t acc = svdup_n_u8(0);
	svbool_t invalid;

	/* Slice 0 is addressed with the indices as given. */
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), acc);

	/* Each following slice starts `vl` entries further into the table. */
	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), acc);

	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, indices), acc);

	*output = acc;

	invalid = svcmpeq_n_u8(p8_in, acc, 255);

	return svcntp_b8(p8_in, invalid) ? -1 : 0;
}
104 
/*
 * Translate every active byte of `indices` through a lookup table that is
 * spread over two consecutive vectors of `vl` bytes each.
 *
 * svtbl_u8() yields 0 for an index that is not smaller than the vector
 * length. The indices are re-based by `vl` before the second slice is
 * consulted, and the two results are added together.
 *
 * indices: bytes to translate.
 * output:  receives the translated bytes; inactive lanes are zero.
 * p8_in:   governing predicate selecting the lanes to translate.
 * vl:      vector length in bytes.
 *
 * Returns -1 if any active lane of the result equals 255, 0 otherwise.
 */
static int
table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output,
		  svbool_t p8_in, uint64_t vl)
{
	svuint8_t acc = svdup_n_u8(0);
	svbool_t invalid;

	/* Slice 0 is addressed with the indices as given. */
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec0, indices), acc);

	/* The second slice starts `vl` entries further into the table. */
	indices = svsub_n_u8_z(p8_in, indices, vl);
	acc = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, indices), acc);

	*output = acc;

	invalid = svcmpeq_n_u8(p8_in, acc, 255);

	return svcntp_b8(p8_in, invalid) ? -1 : 0;
}
126 
/*
 * Load 3-byte groups from `src` and split each group into four 6-bit values.
 *
 * svld3_u8() de-interleaves the input, so lane k of the three loaded vectors
 * holds bytes 0, 1 and 2 of the k-th group. For every active lane:
 *
 *   temp0 = byte0 >> 2
 *   temp1 = ((byte0 << 4) | (byte1 >> 4)) & 0x3F
 *   temp2 = ((byte1 << 2) | (byte2 >> 6)) & 0x3F
 *   temp3 = byte2 & 0x3F
 *
 * Lanes that are inactive in `pred` are not loaded and end up as zero.
 */
static inline void
convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1,
		       svuint8_t *temp2, svuint8_t *temp3)
{
	svuint8x3_t groups = svld3_u8(pred, src);
	svuint8_t byte0 = svget3_u8(groups, 0);
	svuint8_t byte1 = svget3_u8(groups, 1);
	svuint8_t byte2 = svget3_u8(groups, 2);
	svuint8_t low6 = svdup_u8(0x3F);
	svuint8_t mix01, mix12;

	/* Top six bits of byte0. */
	*temp0 = svlsr_n_u8_z(pred, byte0, 2);

	/* Low two bits of byte0 followed by the high four bits of byte1. */
	mix01 = svorr_u8_z(pred, svlsr_n_u8_z(pred, byte1, 4), svlsl_n_u8_z(pred, byte0, 4));
	*temp1 = svand_u8_z(pred, mix01, low6);

	/* Low four bits of byte1 followed by the high two bits of byte2. */
	mix12 = svorr_u8_z(pred, svlsr_n_u8_z(pred, byte2, 6), svlsl_n_u8_z(pred, byte1, 2));
	*temp2 = svand_u8_z(pred, mix12, low6);

	/* Low six bits of byte2. */
	*temp3 = svand_u8_z(pred, byte2, low6);
}
150 
/*
 * Pack four vectors of 6-bit values back into three vectors of bytes.
 *
 * For every lane that is active in `pred`:
 *
 *   output0 = (temp0 << 2) | (temp1 >> 4)
 *   output1 = (temp1 << 4) | (temp2 >> 2)
 *   output2 = (temp2 << 6) | temp3
 *
 * All operations use zeroing predication, so inactive lanes of the outputs
 * are zero.
 */
static inline void
convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
		       svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2)
{
	svuint8_t hi, lo;

	hi = svlsl_n_u8_z(pred, temp0, 2);
	lo = svlsr_n_u8_z(pred, temp1, 4);
	*output0 = svorr_u8_z(pred, hi, lo);

	hi = svlsl_n_u8_z(pred, temp1, 4);
	lo = svlsr_n_u8_z(pred, temp2, 2);
	*output1 = svorr_u8_z(pred, hi, lo);

	hi = svlsl_n_u8_z(pred, temp2, 6);
	*output2 = svorr_u8_z(pred, hi, temp3);
}
159 
/*
 * Store one batch of encoded characters and move the caller's cursors past
 * the data that was consumed and produced.
 *
 * Every lane that is active in `pred` stands for one 3-byte input group and
 * one 4-character output group; svst4_u8() interleaves the four vectors on
 * the way out.
 *
 * Returns the number of source bytes consumed by this batch.
 */
static inline uint64_t
base64_encode_sve_flush(svbool_t pred, svuint8_t out0, svuint8_t out1, svuint8_t out2,
			svuint8_t out3, char **dst, const void **src, size_t *src_len)
{
	const uint64_t groups = svcntp_b8(pred, pred);

	svst4_u8(pred, (uint8_t *)*dst, svcreate4_u8(out0, out1, out2, out3));

	*src = (const uint8_t *)*src + groups * 3;
	*dst += groups * 4;
	*src_len -= groups * 3;

	return groups * 3;
}

/*
 * Base64-encode as many whole 3-byte groups of the input as possible.
 *
 * dst:       in/out cursor into the output buffer; advanced by 4 per group.
 * enc_table: encoding alphabet; the first 64 bytes are read.
 * src:       in/out cursor into the input; advanced by 3 per group.
 * src_len:   in/out number of remaining input bytes; on return it holds
 *            the original value modulo 3.
 *
 * No padding and no terminator are written here. How the 64-entry alphabet
 * is held in registers depends on the vector length:
 *   - 16 bytes:        four table vectors
 *   - 32 or 48 bytes:  two table vectors
 *   - 64 bytes and up: a single table vector
 */
static void
base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len)
{
	const uint8_t *table = (const uint8_t *)enc_table;
	const uint64_t vl = svcntb();
	const uint64_t total = (*src_len / 3) * 3;
	uint64_t done = 0;
	svuint8_t six0, six1, six2, six3;
	svuint8_t ch0, ch1, ch2, ch3;
	svuint8_t tbl0, tbl1, tbl2, tbl3;
	svbool_t lanes;

	if (vl == 16) {
		tbl0 = svld1_u8(svptrue_b8(), table);
		tbl1 = svld1_u8(svptrue_b8(), table + 16);
		tbl2 = svld1_u8(svptrue_b8(), table + 32);
		tbl3 = svld1_u8(svptrue_b8(), table + 48);

		while (done < total) {
			/* One lane per remaining 3-byte group. */
			lanes = svwhilelt_b8(done / 3, total / 3);

			convert_6bits_to_8bits(lanes, (uint8_t *)*src, &six0, &six1, &six2, &six3);

			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six0, &ch0, lanes, vl);
			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six1, &ch1, lanes, vl);
			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six2, &ch2, lanes, vl);
			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six3, &ch3, lanes, vl);

			done += base64_encode_sve_flush(lanes, ch0, ch1, ch2, ch3, dst, src, src_len);
		}
	} else if (vl == 32 || vl == 48) {
		tbl0 = svld1_u8(svptrue_b8(), table);
		/* The second vector only holds what is left of the 64 entries. */
		tbl1 = svld1_u8(svwhilelt_b8(vl, (uint64_t)64), table + vl);

		while (done < total) {
			lanes = svwhilelt_b8(done / 3, total / 3);

			convert_6bits_to_8bits(lanes, (uint8_t *)*src, &six0, &six1, &six2, &six3);

			table_lookup_2vec(tbl0, tbl1, six0, &ch0, lanes, vl);
			table_lookup_2vec(tbl0, tbl1, six1, &ch1, lanes, vl);
			table_lookup_2vec(tbl0, tbl1, six2, &ch2, lanes, vl);
			table_lookup_2vec(tbl0, tbl1, six3, &ch3, lanes, vl);

			done += base64_encode_sve_flush(lanes, ch0, ch1, ch2, ch3, dst, src, src_len);
		}
	} else if (vl >= 64) {
		/* The whole alphabet fits into the first 64 lanes of one vector. */
		tbl0 = svld1_u8(svwhilelt_b8((uint64_t)0, (uint64_t)64), table);

		while (done < total) {
			lanes = svwhilelt_b8(done / 3, total / 3);

			convert_6bits_to_8bits(lanes, (uint8_t *)*src, &six0, &six1, &six2, &six3);

			ch0 = svtbl_u8(tbl0, six0);
			ch1 = svtbl_u8(tbl0, six1);
			ch2 = svtbl_u8(tbl0, six2);
			ch3 = svtbl_u8(tbl0, six3);

			done += base64_encode_sve_flush(lanes, ch0, ch1, ch2, ch3, dst, src, src_len);
		}
	}
}
254 
/*
 * Load one batch of 4-character groups and de-interleave it into four
 * vectors (lane k of str0..str3 holds characters 0..3 of the k-th group).
 *
 * Returns -1 if any active lane of any of the four vectors is >= 128,
 * 0 otherwise.
 */
static inline int
base64_decode_sve_load(svbool_t pred, const uint8_t *src, svuint8_t *str0, svuint8_t *str1,
		       svuint8_t *str2, svuint8_t *str3)
{
	svuint8x4_t quads = svld4_u8(pred, src);

	*str0 = svget4_u8(quads, 0);
	*str1 = svget4_u8(quads, 1);
	*str2 = svget4_u8(quads, 2);
	*str3 = svget4_u8(quads, 3);

	if (svcntp_b8(pred, svcmpge_n_u8(pred, *str0, 128)) ||
	    svcntp_b8(pred, svcmpge_n_u8(pred, *str1, 128)) ||
	    svcntp_b8(pred, svcmpge_n_u8(pred, *str2, 128)) ||
	    svcntp_b8(pred, svcmpge_n_u8(pred, *str3, 128))) {
		return -1;
	}

	return 0;
}

/*
 * Pack one batch of translated 6-bit values into bytes, store them and move
 * the caller's cursors past the data that was consumed and produced.
 *
 * Every lane that is active in `pred` stands for one 4-character input group
 * and one 3-byte output group.
 *
 * Returns the number of source characters consumed by this batch.
 */
static inline uint64_t
base64_decode_sve_flush(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
			svuint8_t temp3, void **dst, const uint8_t **src, size_t *src_len)
{
	const uint64_t groups = svcntp_b8(pred, pred);
	svuint8_t out0, out1, out2;

	convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &out0, &out1, &out2);
	svst3_u8(pred, (uint8_t *)*dst, svcreate3_u8(out0, out1, out2));

	*src += groups * 4;
	*dst = (uint8_t *)*dst + groups * 3;
	*src_len -= groups * 4;

	return groups * 4;
}

/*
 * Base64-decode as many whole 4-character groups of the input as possible.
 *
 * dst:       in/out cursor into the output buffer; advanced by 3 per group.
 * dec_table: decoding table; the first 128 bytes are read. An entry of 255
 *            is treated as "invalid character".
 * src:       in/out cursor into the input; advanced by 4 per group.
 * src_len:   in/out number of remaining input characters.
 *
 * The function returns early, without consuming the current batch, as soon
 * as a batch contains a character >= 128 or one that the table lookup
 * reports as invalid. No error is reported; the cursors simply stay at the
 * start of the offending batch (presumably so the caller can process the
 * remainder itself - confirm against the caller).
 *
 * How the 128-entry table is held in registers depends on the vector length:
 *   - 16 bytes:          eight table vectors
 *   - 32 bytes:          four table vectors
 *   - 48 bytes:          three table vectors
 *   - 64 to 112 bytes:   two table vectors
 *   - 128 bytes and up:  a single table vector
 */
static void
base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len)
{
	const uint64_t vl = svcntb();
	const uint64_t total = (*src_len / 4) * 4;
	uint64_t done = 0;
	svuint8_t str0, str1, str2, str3;
	svuint8_t temp0, temp1, temp2, temp3;
	svuint8_t tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
	svbool_t lanes;

	if (vl == 16) {
		tbl0 = svld1_u8(svptrue_b8(), dec_table);
		tbl1 = svld1_u8(svptrue_b8(), dec_table + 16);
		tbl2 = svld1_u8(svptrue_b8(), dec_table + 32);
		tbl3 = svld1_u8(svptrue_b8(), dec_table + 48);
		tbl4 = svld1_u8(svptrue_b8(), dec_table + 64);
		tbl5 = svld1_u8(svptrue_b8(), dec_table + 80);
		tbl6 = svld1_u8(svptrue_b8(), dec_table + 96);
		tbl7 = svld1_u8(svptrue_b8(), dec_table + 112);

		while (done < total) {
			/* One lane per remaining 4-character group. */
			lanes = svwhilelt_b8(done / 4, total / 4);

			if (base64_decode_sve_load(lanes, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      str0, &temp0, lanes, vl) ||
			    table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      str1, &temp1, lanes, vl) ||
			    table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      str2, &temp2, lanes, vl) ||
			    table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      str3, &temp3, lanes, vl)) {
				return;
			}

			done += base64_decode_sve_flush(lanes, temp0, temp1, temp2, temp3,
							dst, src, src_len);
		}
	} else if (vl == 32) {
		tbl0 = svld1_u8(svptrue_b8(), dec_table);
		tbl1 = svld1_u8(svptrue_b8(), dec_table + vl);
		tbl2 = svld1_u8(svptrue_b8(), dec_table + vl * 2);
		tbl3 = svld1_u8(svptrue_b8(), dec_table + vl * 3);

		while (done < total) {
			lanes = svwhilelt_b8(done / 4, total / 4);

			if (base64_decode_sve_load(lanes, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, str0, &temp0, lanes, vl) ||
			    table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, str1, &temp1, lanes, vl) ||
			    table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, str2, &temp2, lanes, vl) ||
			    table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, str3, &temp3, lanes, vl)) {
				return;
			}

			done += base64_decode_sve_flush(lanes, temp0, temp1, temp2, temp3,
							dst, src, src_len);
		}
	} else if (vl == 48) {
		tbl0 = svld1_u8(svptrue_b8(), dec_table);
		tbl1 = svld1_u8(svptrue_b8(), dec_table + vl);
		/* The third vector only holds what is left of the 128 entries. */
		tbl2 = svld1_u8(svwhilelt_b8(vl * 2, (uint64_t)128), dec_table + 2 * vl);

		while (done < total) {
			lanes = svwhilelt_b8(done / 4, total / 4);

			if (base64_decode_sve_load(lanes, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_3vec(tbl0, tbl1, tbl2, str0, &temp0, lanes, vl) ||
			    table_lookup_3vec(tbl0, tbl1, tbl2, str1, &temp1, lanes, vl) ||
			    table_lookup_3vec(tbl0, tbl1, tbl2, str2, &temp2, lanes, vl) ||
			    table_lookup_3vec(tbl0, tbl1, tbl2, str3, &temp3, lanes, vl)) {
				return;
			}

			done += base64_decode_sve_flush(lanes, temp0, temp1, temp2, temp3,
							dst, src, src_len);
		}
	} else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) {
		tbl0 = svld1_u8(svptrue_b8(), dec_table);
		/* The second vector only holds what is left of the 128 entries. */
		tbl1 = svld1_u8(svwhilelt_b8(vl, (uint64_t)128), dec_table + vl);

		while (done < total) {
			lanes = svwhilelt_b8(done / 4, total / 4);

			if (base64_decode_sve_load(lanes, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			if (table_lookup_2vec(tbl0, tbl1, str0, &temp0, lanes, vl) ||
			    table_lookup_2vec(tbl0, tbl1, str1, &temp1, lanes, vl) ||
			    table_lookup_2vec(tbl0, tbl1, str2, &temp2, lanes, vl) ||
			    table_lookup_2vec(tbl0, tbl1, str3, &temp3, lanes, vl)) {
				return;
			}

			done += base64_decode_sve_flush(lanes, temp0, temp1, temp2, temp3,
							dst, src, src_len);
		}
	} else if (vl >= 128) {
		/* The whole table fits into the first 128 lanes of one vector. */
		tbl0 = svld1_u8(svwhilelt_b8((uint64_t)0, (uint64_t)128), dec_table);

		while (done < total) {
			lanes = svwhilelt_b8(done / 4, total / 4);

			if (base64_decode_sve_load(lanes, *src, &str0, &str1, &str2, &str3)) {
				return;
			}

			temp0 = svtbl_u8(tbl0, str0);
			temp1 = svtbl_u8(tbl0, str1);
			temp2 = svtbl_u8(tbl0, str2);
			temp3 = svtbl_u8(tbl0, str3);

			/* 255 marks a character that is not part of the alphabet. */
			if (svcntp_b8(lanes, svcmpeq_n_u8(lanes, temp0, 255)) ||
			    svcntp_b8(lanes, svcmpeq_n_u8(lanes, temp1, 255)) ||
			    svcntp_b8(lanes, svcmpeq_n_u8(lanes, temp2, 255)) ||
			    svcntp_b8(lanes, svcmpeq_n_u8(lanes, temp3, 255))) {
				return;
			}

			done += base64_decode_sve_flush(lanes, temp0, temp1, temp2, temp3,
							dst, src, src_len);
		}
	}
}
474