1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright (C) 2022 Intel Corporation.
3 * Copyright(c) ARM Limited. 2021 All rights reserved.
4 */
5
6 #ifndef __aarch64__
7 #error Unsupported hardware
8 #endif
9
10 #include "spdk/stdinc.h"
11 #include <arm_sve.h>
12
/*
 * Translate every active byte of `indices` through a table that is spread
 * over eight vectors of `vl` bytes each.
 *
 * In the base64 decode table the first 32 entries are all invalid, so
 * tbl_vec0 and tbl_vec1 are never consulted and the indices are biased by
 * 2 * vl before the first lookup. An index inside that skipped range would
 * wrap past every remaining table vector once biased; svtbl yields 0 for
 * out-of-range indices, so the summed result would be 0 and look like a
 * valid decoded value. Such lanes are therefore recorded before the bias is
 * applied and reported as a failure.
 *
 * tbl_vec0..tbl_vec7: table contents (tbl_vec0 and tbl_vec1 are unused).
 * indices:            bytes to translate.
 * output:             receives the translated bytes; inactive lanes are zero.
 * p8_in:              governing predicate.
 * vl:                 bytes of table held by each vector (callers pass svcntb()).
 *
 * Returns 0 on success, or -1 if any active index lies below 2 * vl or any
 * active result equals 255.
 */
static int
table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t res2, res3, res4, res5, res6, res7;
	svbool_t below_tables;

	/* Must be computed before the bias below overwrites the raw indices. */
	below_tables = svcmplt_n_u8(p8_in, indices, 2 * vl);

	/*
	 * Each step rebases the indices onto the next table vector. At most one
	 * lookup sees an in-range index; all the others contribute zero.
	 */
	indices = svsub_n_u8_z(p8_in, indices, 2 * vl);
	res2 = svtbl_u8(tbl_vec2, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res3 = svtbl_u8(tbl_vec3, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res4 = svtbl_u8(tbl_vec4, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res5 = svtbl_u8(tbl_vec5, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res6 = svtbl_u8(tbl_vec6, indices);
	indices = svsub_n_u8_z(p8_in, indices, vl);
	res7 = svtbl_u8(tbl_vec7, indices);

	/* Merge the partial results; the zeroing predicate clears inactive lanes. */
	*output = svadd_u8_z(p8_in, res2, res3);
	*output = svadd_u8_z(p8_in, res4, *output);
	*output = svadd_u8_z(p8_in, res5, *output);
	*output = svadd_u8_z(p8_in, res6, *output);
	*output = svadd_u8_z(p8_in, res7, *output);

	if (svcntp_b8(p8_in, below_tables)) {
		return -1;
	}

	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
		return -1;
	}

	return 0;
}
51
/*
 * Translate every active byte of `indices` through a table that is spread
 * over four vectors of `vl` bytes each.
 *
 * The index vector is rebased by `vl` between lookups. The partial results
 * are summed, which relies on svtbl yielding zero for any index outside the
 * vector being consulted.
 *
 * output receives the translated bytes with inactive lanes zeroed.
 * Returns 0 on success, or -1 if any active result equals 255.
 */
static int
table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t idx = indices;
	svuint8_t sum;

	sum = svtbl_u8(tbl_vec0, idx);

	idx = svsub_n_u8_z(p8_in, idx, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, idx), sum);

	idx = svsub_n_u8_z(p8_in, idx, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, idx), sum);

	idx = svsub_n_u8_z(p8_in, idx, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec3, idx), sum);

	*output = sum;

	/* 255 is the marker this routine treats as a failed lookup. */
	return svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, sum, 255)) ? -1 : 0;
}
79
/*
 * Translate every active byte of `indices` through a table that is spread
 * over three vectors of `vl` bytes each.
 *
 * The index vector is rebased by `vl` between lookups. The partial results
 * are summed, which relies on svtbl yielding zero for any index outside the
 * vector being consulted.
 *
 * output receives the translated bytes with inactive lanes zeroed.
 * Returns 0 on success, or -1 if any active result equals 255.
 */
static int
table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices,
		  svuint8_t *output, svbool_t p8_in, uint64_t vl)
{
	svuint8_t idx = indices;
	svuint8_t sum;

	sum = svtbl_u8(tbl_vec0, idx);

	idx = svsub_n_u8_z(p8_in, idx, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec1, idx), sum);

	idx = svsub_n_u8_z(p8_in, idx, vl);
	sum = svadd_u8_z(p8_in, svtbl_u8(tbl_vec2, idx), sum);

	*output = sum;

	/* 255 is the marker this routine treats as a failed lookup. */
	return svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, sum, 255)) ? -1 : 0;
}
104
/*
 * Translate every active byte of `indices` through a table that is spread
 * over two vectors of `vl` bytes each.
 *
 * The second lookup uses the indices rebased by `vl`. The two partial
 * results are summed, which relies on svtbl yielding zero for any index
 * outside the vector being consulted.
 *
 * output receives the translated bytes with inactive lanes zeroed.
 * Returns 0 on success, or -1 if any active result equals 255.
 */
static int
table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output,
		  svbool_t p8_in, uint64_t vl)
{
	svuint8_t low_hit, high_hit, sum;

	low_hit = svtbl_u8(tbl_vec0, indices);
	high_hit = svtbl_u8(tbl_vec1, svsub_n_u8_z(p8_in, indices, vl));

	sum = svadd_u8_z(p8_in, high_hit, low_hit);
	*output = sum;

	/* 255 is the marker this routine treats as a failed lookup. */
	return svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, sum, 255)) ? -1 : 0;
}
126
/*
 * Load de-interleaved 3-byte groups from `src` and split each group into
 * four values of at most 6 bits, one value per output vector.
 *
 * For a group (b0, b1, b2) the results are:
 *   temp0 = b0 >> 2
 *   temp1 = ((b0 << 4) | (b1 >> 4)) & 0x3F
 *   temp2 = ((b1 << 2) | (b2 >> 6)) & 0x3F
 *   temp3 = b2 & 0x3F
 *
 * Every operation uses the zeroing form, so inactive lanes of the outputs
 * are zero.
 */
static inline void
convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1,
		       svuint8_t *temp2, svuint8_t *temp3)
{
	svuint8x3_t groups = svld3_u8(pred, src);
	svuint8_t b0 = svget3_u8(groups, 0);
	svuint8_t b1 = svget3_u8(groups, 1);
	svuint8_t b2 = svget3_u8(groups, 2);
	svuint8_t low6 = svdup_u8(0x3F);
	svuint8_t joined;

	*temp0 = svlsr_n_u8_z(pred, b0, 2);

	joined = svorr_u8_z(pred, svlsr_n_u8_z(pred, b1, 4), svlsl_n_u8_z(pred, b0, 4));
	*temp1 = svand_u8_z(pred, joined, low6);

	joined = svorr_u8_z(pred, svlsr_n_u8_z(pred, b2, 6), svlsl_n_u8_z(pred, b1, 2));
	*temp2 = svand_u8_z(pred, joined, low6);

	*temp3 = svand_u8_z(pred, b2, low6);
}
150
/*
 * Pack four vectors of decoded values into three vectors of output bytes:
 *   output0 = (temp0 << 2) | (temp1 >> 4)
 *   output1 = (temp1 << 4) | (temp2 >> 2)
 *   output2 = (temp2 << 6) | temp3
 *
 * Every operation uses the zeroing form, so inactive lanes of the outputs
 * are zero.
 */
static inline void
convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
		       svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2)
{
	svuint8_t upper, lower;

	upper = svlsl_n_u8_z(pred, temp0, 2);
	lower = svlsr_n_u8_z(pred, temp1, 4);
	*output0 = svorr_u8_z(pred, upper, lower);

	upper = svlsl_n_u8_z(pred, temp1, 4);
	lower = svlsr_n_u8_z(pred, temp2, 2);
	*output1 = svorr_u8_z(pred, upper, lower);

	upper = svlsl_n_u8_z(pred, temp2, 6);
	*output2 = svorr_u8_z(pred, upper, temp3);
}
159
/*
 * Store one batch of encoded 4-byte groups through `*dst` and advance the
 * caller's cursors past the batch: `*src` moves 3 bytes per group, `*dst`
 * moves 4 bytes per group and `*src_len` shrinks by 3 bytes per group.
 *
 * Returns the number of groups in the batch (the active lanes of `active`).
 */
static inline uint64_t
base64_sve_emit_quads(svbool_t active, svuint8_t q0, svuint8_t q1, svuint8_t q2, svuint8_t q3,
		      char **dst, const void **src, size_t *src_len)
{
	uint64_t groups = svcntp_b8(active, active);

	svst4_u8(active, (uint8_t *)*dst, svcreate4_u8(q0, q1, q2, q3));

	*src = (uint8_t *)*src + groups * 3;
	*dst += groups * 4;
	*src_len -= groups * 3;

	return groups;
}

/*
 * Encode as many whole 3-byte groups of `*src` as `*src_len` contains,
 * writing 4 output bytes per group to `*dst`.
 *
 * The 64-byte `enc_table` is held in four, two or one vector register(s)
 * depending on the vector length reported by svcntb(), and the matching
 * lookup routine is used for each case.
 *
 * On return `*src`, `*dst` and `*src_len` have been advanced past everything
 * that was encoded; fewer than 3 input bytes remain unprocessed.
 */
static void
base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len)
{
	uint64_t vl = svcntb();
	uint64_t total = *src_len / 3;	/* whole 3-byte groups available */
	uint64_t done = 0;		/* groups encoded so far */
	uint8_t *table = (uint8_t *)enc_table;
	svbool_t all_lanes = svptrue_b8();
	svbool_t lanes;
	svuint8_t six0, six1, six2, six3;
	svuint8_t chr0, chr1, chr2, chr3;
	svuint8_t tbl0, tbl1, tbl2, tbl3;

	if (vl == 16) {
		/* 16-byte vectors: the table occupies four full registers. */
		tbl0 = svld1_u8(all_lanes, table + 0);
		tbl1 = svld1_u8(all_lanes, table + 16);
		tbl2 = svld1_u8(all_lanes, table + 32);
		tbl3 = svld1_u8(all_lanes, table + 48);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			convert_6bits_to_8bits(lanes, (uint8_t *)*src, &six0, &six1, &six2, &six3);

			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six0, &chr0, lanes, vl);
			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six1, &chr1, lanes, vl);
			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six2, &chr2, lanes, vl);
			table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, six3, &chr3, lanes, vl);

			done += base64_sve_emit_quads(lanes, chr0, chr1, chr2, chr3, dst, src, src_len);
		}
	} else if (vl == 32 || vl == 48) {
		/* One full register plus a partial one holding the bytes up to 64. */
		tbl0 = svld1_u8(all_lanes, table + 0);
		lanes = svwhilelt_b8(vl, (uint64_t)64);
		tbl1 = svld1_u8(lanes, table + vl);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			convert_6bits_to_8bits(lanes, (uint8_t *)*src, &six0, &six1, &six2, &six3);

			table_lookup_2vec(tbl0, tbl1, six0, &chr0, lanes, vl);
			table_lookup_2vec(tbl0, tbl1, six1, &chr1, lanes, vl);
			table_lookup_2vec(tbl0, tbl1, six2, &chr2, lanes, vl);
			table_lookup_2vec(tbl0, tbl1, six3, &chr3, lanes, vl);

			done += base64_sve_emit_quads(lanes, chr0, chr1, chr2, chr3, dst, src, src_len);
		}
	} else if (vl >= 64) {
		/* The whole table fits in the first 64 lanes of a single register. */
		lanes = svwhilelt_b8((uint64_t)0, (uint64_t)64);
		tbl0 = svld1_u8(lanes, table);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			convert_6bits_to_8bits(lanes, (uint8_t *)*src, &six0, &six1, &six2, &six3);

			chr0 = svtbl_u8(tbl0, six0);
			chr1 = svtbl_u8(tbl0, six1);
			chr2 = svtbl_u8(tbl0, six2);
			chr3 = svtbl_u8(tbl0, six3);

			done += base64_sve_emit_quads(lanes, chr0, chr1, chr2, chr3, dst, src, src_len);
		}
	}
}
254
/*
 * Load one batch of de-interleaved 4-byte groups from `src` into c0..c3.
 *
 * Returns 0 when every active byte is below 128, or -1 otherwise. The
 * outputs are written in both cases.
 */
static inline int
base64_sve_load_quads(svbool_t active, const uint8_t *src, svuint8_t *c0, svuint8_t *c1,
		      svuint8_t *c2, svuint8_t *c3)
{
	svuint8x4_t quads = svld4_u8(active, src);

	*c0 = svget4_u8(quads, 0);
	*c1 = svget4_u8(quads, 1);
	*c2 = svget4_u8(quads, 2);
	*c3 = svget4_u8(quads, 3);

	if (svcntp_b8(active, svcmpge_n_u8(active, *c0, 128)) ||
	    svcntp_b8(active, svcmpge_n_u8(active, *c1, 128)) ||
	    svcntp_b8(active, svcmpge_n_u8(active, *c2, 128)) ||
	    svcntp_b8(active, svcmpge_n_u8(active, *c3, 128))) {
		return -1;
	}

	return 0;
}

/*
 * Pack one batch of decoded values into 3-byte groups, store them through
 * `*dst` and advance the caller's cursors past the batch: `*src` moves
 * 4 bytes per group, `*dst` moves 3 bytes per group and `*src_len` shrinks
 * by 4 bytes per group.
 *
 * Returns the number of groups in the batch (the active lanes of `active`).
 */
static inline uint64_t
base64_sve_emit_triples(svbool_t active, svuint8_t v0, svuint8_t v1, svuint8_t v2, svuint8_t v3,
			void **dst, const uint8_t **src, size_t *src_len)
{
	svuint8_t out0, out1, out2;
	uint64_t groups = svcntp_b8(active, active);

	convert_8bits_to_6bits(active, v0, v1, v2, v3, &out0, &out1, &out2);
	svst3_u8(active, (uint8_t *)*dst, svcreate3_u8(out0, out1, out2));

	*src += groups * 4;
	*dst = (uint8_t *)*dst + groups * 3;
	*src_len -= groups * 4;

	return groups;
}

/*
 * Decode as many whole 4-byte groups of `*src` as `*src_len` contains,
 * writing 3 output bytes per group to `*dst`.
 *
 * The 128-byte `dec_table` is held in eight, four, three, two or one vector
 * register(s) depending on the vector length reported by svcntb(), and the
 * matching lookup routine is used for each case.
 *
 * Decoding stops at the first batch that contains an input byte >= 128 or
 * whose lookup fails (a table value of 255). Nothing is stored for that
 * batch and `*src`, `*dst` and `*src_len` are left pointing at its start.
 * Otherwise they are advanced past everything that was decoded.
 */
static void
base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len)
{
	uint64_t vl = svcntb();
	uint64_t total = *src_len / 4;	/* whole 4-byte groups available */
	uint64_t done = 0;		/* groups decoded so far */
	uint8_t *table = (uint8_t *)dec_table;
	svbool_t all_lanes = svptrue_b8();
	svbool_t lanes;
	svuint8_t c0, c1, c2, c3;
	svuint8_t v0, v1, v2, v3;
	svuint8_t tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;

	if (vl == 16) {
		/* 16-byte vectors: the table occupies eight full registers. */
		tbl0 = svld1_u8(all_lanes, table + 0);
		tbl1 = svld1_u8(all_lanes, table + 16);
		tbl2 = svld1_u8(all_lanes, table + 32);
		tbl3 = svld1_u8(all_lanes, table + 48);
		tbl4 = svld1_u8(all_lanes, table + 64);
		tbl5 = svld1_u8(all_lanes, table + 80);
		tbl6 = svld1_u8(all_lanes, table + 96);
		tbl7 = svld1_u8(all_lanes, table + 112);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			if (base64_sve_load_quads(lanes, *src, &c0, &c1, &c2, &c3)) {
				return;
			}

			if (table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      c0, &v0, lanes, vl) ||
			    table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      c1, &v1, lanes, vl) ||
			    table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      c2, &v2, lanes, vl) ||
			    table_lookup_8vec(tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7,
					      c3, &v3, lanes, vl)) {
				return;
			}

			done += base64_sve_emit_triples(lanes, v0, v1, v2, v3, dst, src, src_len);
		}
	} else if (vl == 32) {
		/* 32-byte vectors: the table occupies four full registers. */
		tbl0 = svld1_u8(all_lanes, table + 0);
		tbl1 = svld1_u8(all_lanes, table + vl);
		tbl2 = svld1_u8(all_lanes, table + vl * 2);
		tbl3 = svld1_u8(all_lanes, table + vl * 3);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			if (base64_sve_load_quads(lanes, *src, &c0, &c1, &c2, &c3)) {
				return;
			}

			if (table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, c0, &v0, lanes, vl) ||
			    table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, c1, &v1, lanes, vl) ||
			    table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, c2, &v2, lanes, vl) ||
			    table_lookup_4vec(tbl0, tbl1, tbl2, tbl3, c3, &v3, lanes, vl)) {
				return;
			}

			done += base64_sve_emit_triples(lanes, v0, v1, v2, v3, dst, src, src_len);
		}
	} else if (vl == 48) {
		/* Two full registers plus a partial one holding the bytes up to 128. */
		tbl0 = svld1_u8(all_lanes, table + 0);
		tbl1 = svld1_u8(all_lanes, table + vl);
		lanes = svwhilelt_b8(vl * 2, (uint64_t)128);
		tbl2 = svld1_u8(lanes, table + 2 * vl);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			if (base64_sve_load_quads(lanes, *src, &c0, &c1, &c2, &c3)) {
				return;
			}

			if (table_lookup_3vec(tbl0, tbl1, tbl2, c0, &v0, lanes, vl) ||
			    table_lookup_3vec(tbl0, tbl1, tbl2, c1, &v1, lanes, vl) ||
			    table_lookup_3vec(tbl0, tbl1, tbl2, c2, &v2, lanes, vl) ||
			    table_lookup_3vec(tbl0, tbl1, tbl2, c3, &v3, lanes, vl)) {
				return;
			}

			done += base64_sve_emit_triples(lanes, v0, v1, v2, v3, dst, src, src_len);
		}
	} else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) {
		/* One full register plus a partial one holding the bytes up to 128. */
		tbl0 = svld1_u8(all_lanes, table + 0);
		lanes = svwhilelt_b8(vl, (uint64_t)128);
		tbl1 = svld1_u8(lanes, table + vl);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			if (base64_sve_load_quads(lanes, *src, &c0, &c1, &c2, &c3)) {
				return;
			}

			if (table_lookup_2vec(tbl0, tbl1, c0, &v0, lanes, vl) ||
			    table_lookup_2vec(tbl0, tbl1, c1, &v1, lanes, vl) ||
			    table_lookup_2vec(tbl0, tbl1, c2, &v2, lanes, vl) ||
			    table_lookup_2vec(tbl0, tbl1, c3, &v3, lanes, vl)) {
				return;
			}

			done += base64_sve_emit_triples(lanes, v0, v1, v2, v3, dst, src, src_len);
		}
	} else if (vl >= 128) {
		/* The whole table fits in the first 128 lanes of a single register. */
		lanes = svwhilelt_b8((uint64_t)0, (uint64_t)128);
		tbl0 = svld1_u8(lanes, table + 0);

		while (done < total) {
			lanes = svwhilelt_b8(done, total);

			if (base64_sve_load_quads(lanes, *src, &c0, &c1, &c2, &c3)) {
				return;
			}

			v0 = svtbl_u8(tbl0, c0);
			v1 = svtbl_u8(tbl0, c1);
			v2 = svtbl_u8(tbl0, c2);
			v3 = svtbl_u8(tbl0, c3);

			/* A table value of 255 is treated as a failed lookup. */
			if (svcntp_b8(lanes, svcmpeq_n_u8(lanes, v0, 255)) ||
			    svcntp_b8(lanes, svcmpeq_n_u8(lanes, v1, 255)) ||
			    svcntp_b8(lanes, svcmpeq_n_u8(lanes, v2, 255)) ||
			    svcntp_b8(lanes, svcmpeq_n_u8(lanes, v3, 255))) {
				return;
			}

			done += base64_sve_emit_triples(lanes, v0, v1, v2, v3, dst, src, src_len);
		}
	}
}
474