xref: /spdk/lib/util/base64_sve.c (revision 7506a7aa53d239f533af3bc768f0d2af55e735fe)
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) ARM Limited. 2021 All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions are
 *   met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 *   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 *   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 *   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __aarch64__
#error Unsupported hardware
#endif

#include "spdk/stdinc.h"
#include <arm_sve.h>

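/*
 * SVE-accelerated base64 encode/decode.
 *
 * The encode table is 64 bytes and the decode table is 128 bytes, so a table
 * may span several SVE vectors depending on the vector length (VL).  The
 * table_lookup_*vec() helpers below perform a lookup across N table vectors:
 * svtbl_u8() returns 0 for out-of-range indices, so each vector is probed in
 * turn, the indices are rebased by VL between probes, and the per-vector
 * results are summed.  At most one probe can hit for a given index, so the
 * sum is the looked-up byte.  A looked-up value of 255 marks an invalid
 * character in the decode table and is reported as -1.
 */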
static int
table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t res2, res3, res4, res5, res6, res7;

	/*
	 * In the base64 decode table the first 32 entries are all invalid (255),
	 * so tbl_vec0 and tbl_vec1 can be skipped: rebase the indices by 2 * vl
	 * and start the lookups at tbl_vec2.  Note that input bytes below 32
	 * therefore sum to 0 here rather than 255.
	 */
	indices = svsub_n_u8_z(p8_in, indices, 2 * vl);
	res2 = svtbl_u8(tbl_vec2, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res3 = svtbl_u8(tbl_vec3, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res4 = svtbl_u8(tbl_vec4, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res5 = svtbl_u8(tbl_vec5, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res6 = svtbl_u8(tbl_vec6, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res7 = svtbl_u8(tbl_vec7, indices);

	*output = svdup_n_u8(0);
	*output = svadd_u8_z(p8_in, res2, *output);
	*output = svadd_u8_z(p8_in, res3, *output);
	*output = svadd_u8_z(p8_in, res4, *output);
	*output = svadd_u8_z(p8_in, res5, *output);
	*output = svadd_u8_z(p8_in, res6, *output);
	*output = svadd_u8_z(p8_in, res7, *output);

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
		return -1;
	}

	return 0;
}

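/*
 * Same lookup scheme for a table that spans four vectors: the 64-byte encode
 * table at VL = 16 and the 128-byte decode table at VL = 32.
 */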
static int
table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t res0, res1, res2, res3;

	res0 = svtbl_u8(tbl_vec0, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res1 = svtbl_u8(tbl_vec1, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res2 = svtbl_u8(tbl_vec2, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res3 = svtbl_u8(tbl_vec3, indices);

	*output = svdup_n_u8(0);

	*output = svadd_u8_z(p8_in, res0, *output);
	*output = svadd_u8_z(p8_in, res1, *output);
	*output = svadd_u8_z(p8_in, res2, *output);
	*output = svadd_u8_z(p8_in, res3, *output);

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
		return -1;
	}

	return 0;
}

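/* Lookup across three vectors: the 128-byte decode table at VL = 48. */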
static int
table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices,
		  svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t res0, res1, res2;

	res0 = svtbl_u8(tbl_vec0, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res1 = svtbl_u8(tbl_vec1, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res2 = svtbl_u8(tbl_vec2, indices);

	*output = svdup_n_u8(0);

	*output = svadd_u8_z(p8_in, res0, *output);
	*output = svadd_u8_z(p8_in, res1, *output);
	*output = svadd_u8_z(p8_in, res2, *output);

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
		return -1;
	}

	return 0;
}

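/*
 * Lookup across two vectors: the encode table at VL = 32 or 48 and the
 * decode table at VL = 64 through 112.
 */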
static int
table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output,
		  svbool_t p8_in, uint64_t vl)
{
	svuint8_t res0, res1;

	res0 = svtbl_u8(tbl_vec0, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res1 = svtbl_u8(tbl_vec1, indices);

	*output = svdup_n_u8(0);

	*output = svadd_u8_z(p8_in, res0, *output);
	*output = svadd_u8_z(p8_in, res1, *output);

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
		return -1;
	}

	return 0;
}

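/*
 * Load 3 * VL input bytes with a de-interleaving svld3 and split every group
 * of three bytes into four 6-bit values, one per output vector.  A sketch of
 * the scalar equivalent:
 *
 *   temp0 =   str0 >> 2
 *   temp1 = ((str0 << 4) | (str1 >> 4)) & 0x3F
 *   temp2 = ((str1 << 2) | (str2 >> 6)) & 0x3F
 *   temp3 =   str2 & 0x3F
 *
 * For example, the input bytes "Man" (0x4D 0x61 0x6E) become the indices
 * 19, 22, 5, 46, which the encode table maps to "TWFu".
 */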
static inline void
convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1,
		       svuint8_t *temp2, svuint8_t *temp3)
{
	svuint8_t str0, str1, str2;
	svuint8x3_t ld_enc_input;

	ld_enc_input = svld3_u8(pred, src);

	str0 = svget3_u8(ld_enc_input, 0);
	str1 = svget3_u8(ld_enc_input, 1);
	str2 = svget3_u8(ld_enc_input, 2);

	*temp0 = svlsr_n_u8_z(pred, str0, 2);
	*temp1 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str1, 4),
					     svlsl_n_u8_z(pred, str0, 4)),
			    svdup_u8(0x3F));
	*temp2 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str2, 6),
					     svlsl_n_u8_z(pred, str1, 2)),
			    svdup_u8(0x3F));
	*temp3 = svand_u8_z(pred, str2, svdup_u8(0x3F));
}

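/*
 * Inverse of the above: repack four 6-bit values (one per input vector) into
 * three output bytes, ready to be stored with an interleaving svst3.
 */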
static inline void
convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
		       svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2)
{
	*output0 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp0, 2), svlsr_n_u8_z(pred, temp1, 4));
	*output1 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp1, 4), svlsr_n_u8_z(pred, temp2, 2));
	*output2 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp2, 6), temp3);
}

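/*
 * Encode the largest multiple of three input bytes, advancing *src and *dst
 * and decrementing *src_len as data is consumed; the 1- or 2-byte tail (and
 * its '=' padding) is presumably left to the scalar caller.  The branches
 * below differ only in how the 64-byte encode table is split across vectors
 * for the given VL.
 */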
static void
base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len)
{
	uint64_t vl = svcntb();
	svuint8_t temp0, temp1, temp2, temp3;
	svuint8_t output0, output1, output2, output3;
	svuint8_t tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3;
	svuint8x4_t st_enc_output;
	svbool_t p8_all = svptrue_b8();
	svbool_t pred;
	uint64_t i = 0;
	uint64_t pred_count = 0;
	uint64_t N = (*src_len / 3) * 3;

	if (vl == 16) {
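		/* VL = 16: the 64-byte encode table spans four vectors. */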

		tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0);
		tbl_enc1 = svld1_u8(p8_all, (uint8_t *)enc_table + 16);
		tbl_enc2 = svld1_u8(p8_all, (uint8_t *)enc_table + 32);
		tbl_enc3 = svld1_u8(p8_all, (uint8_t *)enc_table + 48);

		while (i < N) {
			pred = svwhilelt_b8(i / 3, N / 3);

			convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);

			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp0, &output0, pred, vl);
			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp1, &output1, pred, vl);
			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp2, &output2, pred, vl);
			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp3, &output3, pred, vl);

			st_enc_output = svcreate4_u8(output0, output1, output2, output3);
			svst4_u8(pred, (uint8_t *)*dst, st_enc_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 3;
			*dst += pred_count * 4;
			*src_len -= pred_count * 3;
			i += pred_count * 3;

		}
	} else if (vl == 32 || vl == 48) {
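		/* VL = 32 or 48: two table vectors; the second load covers the remaining 64 - VL bytes. */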

		tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0);
		pred = svwhilelt_b8(vl, (uint64_t)64);
		tbl_enc1 = svld1_u8(pred, (uint8_t *)enc_table + vl);

		while (i < N) {
			pred = svwhilelt_b8(i / 3, N / 3);

			convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);

			table_lookup_2vec(tbl_enc0, tbl_enc1, temp0, &output0, pred, vl);
			table_lookup_2vec(tbl_enc0, tbl_enc1, temp1, &output1, pred, vl);
			table_lookup_2vec(tbl_enc0, tbl_enc1, temp2, &output2, pred, vl);
			table_lookup_2vec(tbl_enc0, tbl_enc1, temp3, &output3, pred, vl);

			st_enc_output = svcreate4_u8(output0, output1, output2, output3);
			svst4_u8(pred, (uint8_t *)*dst, st_enc_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 3;
			*dst += pred_count * 4;
			*src_len -= pred_count * 3;
			i += pred_count * 3;

		}
	} else if (vl >= 64) {
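		/* VL >= 64: the whole 64-byte table fits in one vector, so plain svtbl_u8() lookups suffice. */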

		pred = svwhilelt_b8((uint64_t)0, (uint64_t)64);
		tbl_enc0 = svld1_u8(pred, (uint8_t *)enc_table);

		while (i < N) {
			pred = svwhilelt_b8(i / 3, N / 3);

			convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);

			output0 = svtbl_u8(tbl_enc0, temp0);
			output1 = svtbl_u8(tbl_enc0, temp1);
			output2 = svtbl_u8(tbl_enc0, temp2);
			output3 = svtbl_u8(tbl_enc0, temp3);

			st_enc_output = svcreate4_u8(output0, output1, output2, output3);
			svst4_u8(pred, (uint8_t *)*dst, st_enc_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 3;
			*dst += pred_count * 4;
			*src_len -= pred_count * 3;
			i += pred_count * 3;

		}
	}
}

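/*
 * Decode the largest multiple of four input bytes, advancing *src and *dst
 * and decrementing *src_len as data is consumed.  Input bytes >= 128 and
 * characters that map to 255 in the decode table are invalid; if any such
 * byte is found in the current stride, the function returns early, leaving
 * *src_len not fully consumed so that the caller can presumably detect and
 * report the error.  As with the encoder, the branches differ only in how
 * the 128-byte decode table is split across vectors for the given VL.
 */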
static void
base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len)
{
	uint64_t vl = svcntb();
	svuint8_t str0, str1, str2, str3;
	svuint8_t temp0, temp1, temp2, temp3;
	svuint8_t output0, output1, output2;
	svuint8_t tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, tbl_dec7;
	svuint8x3_t st_dec_output;
	svbool_t p8_all = svptrue_b8();
	svbool_t pred;
	uint64_t i = 0;
	uint64_t pred_count = 0;
	uint64_t N = (*src_len / 4) * 4;
	svuint8x4_t ld_dec_input;

	if (vl == 16) {
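		/* VL = 16: the 128-byte decode table spans eight vectors. */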
		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
		tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + 16);
		tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + 32);
		tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + 48);
		tbl_dec4 = svld1_u8(p8_all, (uint8_t *)dec_table + 64);
		tbl_dec5 = svld1_u8(p8_all, (uint8_t *)dec_table + 80);
		tbl_dec6 = svld1_u8(p8_all, (uint8_t *)dec_table + 96);
		tbl_dec7 = svld1_u8(p8_all, (uint8_t *)dec_table + 112);

		while (i < N) {
			pred = svwhilelt_b8(i / 4, N / 4);

			ld_dec_input = svld4_u8(pred, *src);

			str0 = svget4_u8(ld_dec_input, 0);
			str1 = svget4_u8(ld_dec_input, 1);
			str2 = svget4_u8(ld_dec_input, 2);
			str3 = svget4_u8(ld_dec_input, 3);

			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }

			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
					      tbl_dec7, str0, &temp0, pred, vl)) { return; }
			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
					      tbl_dec7, str1, &temp1, pred, vl)) { return; }
			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
					      tbl_dec7, str2, &temp2, pred, vl)) { return; }
			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
					      tbl_dec7, str3, &temp3, pred, vl)) { return; }

			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);

			st_dec_output = svcreate3_u8(output0, output1, output2);
			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 4;
			*dst += pred_count * 3;
			*src_len -= pred_count * 4;
			i += pred_count * 4;

		}
	} else if (vl == 32) {
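		/* VL = 32: the decode table spans four vectors. */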
		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
		tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl);
		tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 2);
		tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 3);

		while (i < N) {
			pred = svwhilelt_b8(i / 4, N / 4);

			ld_dec_input = svld4_u8(pred, *src);

			str0 = svget4_u8(ld_dec_input, 0);
			str1 = svget4_u8(ld_dec_input, 1);
			str2 = svget4_u8(ld_dec_input, 2);
			str3 = svget4_u8(ld_dec_input, 3);

			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }

			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str0, &temp0, pred, vl)) { return; }
			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str1, &temp1, pred, vl)) { return; }
			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str2, &temp2, pred, vl)) { return; }
			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str3, &temp3, pred, vl)) { return; }

			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);

			st_dec_output = svcreate3_u8(output0, output1, output2);
			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 4;
			*dst += pred_count * 3;
			*src_len -= pred_count * 4;
			i += pred_count * 4;

		}
	} else if (vl == 48) {
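		/* VL = 48: three table vectors; the third load covers the remaining 32 bytes. */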
		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
		tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl);
		pred = svwhilelt_b8(vl * 2, (uint64_t)128);
		tbl_dec2 = svld1_u8(pred, (uint8_t *)dec_table + 2 * vl);

		while (i < N) {
			pred = svwhilelt_b8(i / 4, N / 4);

			ld_dec_input = svld4_u8(pred, *src);

			str0 = svget4_u8(ld_dec_input, 0);
			str1 = svget4_u8(ld_dec_input, 1);
			str2 = svget4_u8(ld_dec_input, 2);
			str3 = svget4_u8(ld_dec_input, 3);

			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }

			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str0, &temp0, pred, vl)) { return; }
			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str1, &temp1, pred, vl)) { return; }
			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str2, &temp2, pred, vl)) { return; }
			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str3, &temp3, pred, vl)) { return; }

			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);

			st_dec_output = svcreate3_u8(output0, output1, output2);
			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 4;
			*dst += pred_count * 3;
			*src_len -= pred_count * 4;
			i += pred_count * 4;

		}
	} else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) {
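		/* VL = 64 to 112: two table vectors; the second load covers the remaining 128 - VL bytes. */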
		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
		pred = svwhilelt_b8(vl, (uint64_t)128);
		tbl_dec1 = svld1_u8(pred, (uint8_t *)dec_table + vl);

		while (i < N) {
			pred = svwhilelt_b8(i / 4, N / 4);

			ld_dec_input = svld4_u8(pred, *src);

			str0 = svget4_u8(ld_dec_input, 0);
			str1 = svget4_u8(ld_dec_input, 1);
			str2 = svget4_u8(ld_dec_input, 2);
			str3 = svget4_u8(ld_dec_input, 3);

			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }

			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str0, &temp0, pred, vl)) { return; }
			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str1, &temp1, pred, vl)) { return; }
			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str2, &temp2, pred, vl)) { return; }
			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str3, &temp3, pred, vl)) { return; }

			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);

			st_dec_output = svcreate3_u8(output0, output1, output2);
			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 4;
			*dst += pred_count * 3;
			*src_len -= pred_count * 4;
			i += pred_count * 4;

		}
	} else if (vl >= 128) {
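		/* VL >= 128: the whole table fits in one vector; invalid entries (255) are checked inline. */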
		pred = svwhilelt_b8((uint64_t)0, (uint64_t)128);
		tbl_dec0 = svld1_u8(pred, (uint8_t *)dec_table + 0);

		while (i < N) {
			pred = svwhilelt_b8(i / 4, N / 4);

			ld_dec_input = svld4_u8(pred, *src);

			str0 = svget4_u8(ld_dec_input, 0);
			str1 = svget4_u8(ld_dec_input, 1);
			str2 = svget4_u8(ld_dec_input, 2);
			str3 = svget4_u8(ld_dec_input, 3);

			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }

			temp0 = svtbl_u8(tbl_dec0, str0);
			temp1 = svtbl_u8(tbl_dec0, str1);
			temp2 = svtbl_u8(tbl_dec0, str2);
			temp3 = svtbl_u8(tbl_dec0, str3);

			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp0, 255))) { return; }
			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp1, 255))) { return; }
			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp2, 255))) { return; }
			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp3, 255))) { return; }

			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);

			st_dec_output = svcreate3_u8(output0, output1, output2);
			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);

			pred_count = svcntp_b8(pred, pred);
			*src += pred_count * 4;
			*dst += pred_count * 3;
			*src_len -= pred_count * 4;
			i += pred_count * 4;

		}
	}
}