1ddf8904cSJim Harris /* SPDX-License-Identifier: BSD-2-Clause
2a6dbe372Spaul luse * Copyright (C) 2020 Intel Corporation.
3a41fb6e6SRichael Zhuang * Copyright (c) 2005-2007, Nick Galbreath
4a41fb6e6SRichael Zhuang * Copyright (c) 2013-2017, Alfred Klomp
5a41fb6e6SRichael Zhuang * Copyright (c) 2015-2017, Wojciech Mula
6a41fb6e6SRichael Zhuang * Copyright (c) 2016-2017, Matthieu Darbois
7a41fb6e6SRichael Zhuang * All rights reserved.
8a41fb6e6SRichael Zhuang */
9a41fb6e6SRichael Zhuang
10a41fb6e6SRichael Zhuang #ifndef __aarch64__
11a41fb6e6SRichael Zhuang #error Unsupported hardware
12a41fb6e6SRichael Zhuang #endif
13a41fb6e6SRichael Zhuang
14a41fb6e6SRichael Zhuang #include "spdk/stdinc.h"
/*
 * Encoding
 * Use a 64-byte lookup to do the encoding.
 * Reuse the existing 64-entry Base64 encoding table passed in by the caller.
 *
 * Decoding
 * The input consists of five valid character sets in the Base64 alphabet,
 * which we need to map back to the 6-bit values they represent.
 * There are three ranges, two singles, and then there's the rest.
 *
 * LUT1[0-63] = base64_dec_table_neon64[0-63]
 * LUT2[0-63] = base64_dec_table_neon64[64-127]
 *   #   From        To         LUT  Characters
 *   1   [0..42]     [255]      #1   invalid input
 *   2   [43]        [62]       #1   +
 *   3   [44..46]    [255]      #1   invalid input
 *   4   [47]        [63]       #1   /
 *   5   [48..57]    [52..61]   #1   0..9
 *   6   [58..63]    [255]      #1   invalid input
 *   7   [64]        [255]      #2   invalid input
 *   8   [65..90]    [0..25]    #2   A..Z
 *   9   [91..96]    [255]      #2   invalid input
 *  10   [97..122]   [26..51]   #2   a..z
 *  11   [123..126]  [255]      #2   invalid input
 * (12)  Everything else => invalid input
 */
/*
 * Decode table for the standard Base64 alphabet, split into two 64-byte halves.
 * Entries 0..63 (LUT1) are indexed directly by input bytes 0..63.
 * Entries 64..127 (LUT2) are indexed by (input - 63), so slot 0 of LUT2 stands
 * for "input was below 64" and must contribute 0 when OR-ed with the LUT1 result.
 * 0xFF marks an invalid input byte; valid entries hold the 6-bit value.
 */
static const uint8_t base64_dec_table_neon64[] = {
	/* LUT1, inputs 0..15: invalid */
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT1, inputs 16..31: invalid */
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT1, inputs 32..47: '+' (43) -> 62, '/' (47) -> 63 */
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,   62, 0xFF, 0xFF, 0xFF,   63,
	/* LUT1, inputs 48..63: '0'..'9' -> 52..61 */
	  52,   53,   54,   55,   56,   57,   58,   59,   60,   61, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT2, slots 0..15: placeholder 0, '@' invalid, 'A'..'N' -> 0..13 */
	   0, 0xFF,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,   12,   13,
	/* LUT2, slots 16..31: 'O'..'Z' -> 14..25, inputs 91..94 invalid */
	  14,   15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT2, slots 32..47: inputs 95..96 invalid, 'a'..'n' -> 26..39 */
	0xFF, 0xFF,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,   38,   39,
	/* LUT2, slots 48..63: 'o'..'z' -> 40..51, inputs 123..126 invalid */
	  40,   41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51, 0xFF, 0xFF, 0xFF, 0xFF
};
51a41fb6e6SRichael Zhuang
/*
 * LUT1[0-63] = base64_urlsafe_dec_table_neon64[0-63]
 * LUT2[0-63] = base64_urlsafe_dec_table_neon64[64-127]
 *   #   From        To         LUT  Characters
 *   1   [0..44]     [255]      #1   invalid input
 *   2   [45]        [62]       #1   -
 *   3   [46..47]    [255]      #1   invalid input
 *   4   [48..57]    [52..61]   #1   0..9
 *   5   [58..63]    [255]      #1   invalid input
 *   6   [64]        [255]      #2   invalid input
 *   7   [65..90]    [0..25]    #2   A..Z
 *   8   [91..94]    [255]      #2   invalid input
 *   9   [95]        [63]       #2   _
 *  10   [96]        [255]      #2   invalid input
 *  11   [97..122]   [26..51]   #2   a..z
 *  12   [123..126]  [255]      #2   invalid input
 * (13)  Everything else => invalid input
 */
/*
 * Decode table for the URL-safe Base64 alphabet ('-' and '_' in place of '+' and '/'),
 * split into two 64-byte halves.
 * Entries 0..63 (LUT1) are indexed directly by input bytes 0..63.
 * Entries 64..127 (LUT2) are indexed by (input - 63), so slot 0 of LUT2 stands
 * for "input was below 64" and must contribute 0 when OR-ed with the LUT1 result.
 * 0xFF marks an invalid input byte; valid entries hold the 6-bit value.
 */
static const uint8_t base64_urlsafe_dec_table_neon64[] = {
	/* LUT1, inputs 0..15: invalid */
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT1, inputs 16..31: invalid */
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT1, inputs 32..47: '-' (45) -> 62 */
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,   62, 0xFF, 0xFF,
	/* LUT1, inputs 48..63: '0'..'9' -> 52..61 */
	  52,   53,   54,   55,   56,   57,   58,   59,   60,   61, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT2, slots 0..15: placeholder 0, '@' invalid, 'A'..'N' -> 0..13 */
	   0, 0xFF,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,   12,   13,
	/* LUT2, slots 16..31: 'O'..'Z' -> 14..25, inputs 91..94 invalid */
	  14,   15,   16,   17,   18,   19,   20,   21,   22,   23,   24,   25, 0xFF, 0xFF, 0xFF, 0xFF,
	/* LUT2, slots 32..47: '_' (95) -> 63, '`' (96) invalid, 'a'..'n' -> 26..39 */
	  63, 0xFF,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,   37,   38,   39,
	/* LUT2, slots 48..63: 'o'..'z' -> 40..51, inputs 123..126 invalid */
	  40,   41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51, 0xFF, 0xFF, 0xFF, 0xFF
};
80a41fb6e6SRichael Zhuang
81a41fb6e6SRichael Zhuang #include <arm_neon.h>
82a41fb6e6SRichael Zhuang #define CMPGT(s,n) vcgtq_u8((s), vdupq_n_u8(n))
83a41fb6e6SRichael Zhuang
/*
 * Load 64 consecutive bytes starting at p into four 16-byte NEON registers,
 * in order: val[0] holds bytes 0..15, val[1] bytes 16..31, and so on.
 * The result is in the form expected by the 4-register table lookups
 * (vqtbl4q_u8 / vqtbx4q_u8) used by the encoder and decoder below.
 */
static inline uint8x16x4_t
load_64byte_table(const uint8_t *p)
{
	uint8x16x4_t table;
	int i;

	for (i = 0; i < 4; i++) {
		table.val[i] = vld1q_u8(p + 16 * i);
	}

	return table;
}
94a41fb6e6SRichael Zhuang
/*
 * Base64-encode as many whole 48-byte groups of input as are available, using NEON.
 *
 * dst       - in/out: output cursor; 64 bytes are written and the cursor is
 *             advanced by 64 for every group encoded.
 * enc_table - 64-entry table mapping a 6-bit value (0..63) to its output character.
 * src       - in/out: input cursor; advanced by 48 for every group encoded.
 * src_len   - in/out: number of input bytes remaining; reduced by 48 per group.
 *
 * On return *src_len is below 48; any such tail is not processed here.
 */
static void
base64_encode_neon64(char **dst, const char *enc_table, const void **src, size_t *src_len)
{
	/*
	 * char and uint8_t are distinct types, so the table pointer is converted
	 * explicitly rather than relying on an implicit (ill-formed) conversion.
	 */
	const uint8x16x4_t tbl_enc = load_64byte_table((const uint8_t *)enc_table);
	const uint8x16_t mask_6bit = vdupq_n_u8(0x3F);

	while (*src_len >= 48) {
		uint8x16x3_t str;
		uint8x16x4_t res;

		/* Load 48 bytes and deinterleave: val[k] holds byte k of each 3-byte group */
		str = vld3q_u8((const uint8_t *)*src);

		/* Divide bits of three input bytes over four output bytes and clear top two bits */
		res.val[0] = vshrq_n_u8(str.val[0], 2);
		res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4), vshlq_n_u8(str.val[0], 4)),
				      mask_6bit);
		res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6), vshlq_n_u8(str.val[1], 2)),
				      mask_6bit);
		res.val[3] = vandq_u8(str.val[2], mask_6bit);

		/*
		 * The bits have now been shifted to the right locations;
		 * translate their values 0..63 to the Base64 alphabet
		 * with a 64-byte table lookup.
		 */
		res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
		res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
		res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
		res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);

		/* Interleave and store result */
		vst4q_u8((uint8_t *)*dst, res);

		*src = (const uint8_t *)*src + 48;	/* 3 * 16 bytes of input */
		*dst += 64;				/* 4 * 16 bytes of output */
		*src_len -= 48;
	}
}
133a41fb6e6SRichael Zhuang
/*
 * Base64-decode whole 64-character groups of input with NEON for as long as
 * every character in the group is valid.
 *
 * dst              - in/out: output cursor; 48 bytes are written and the cursor
 *                    is advanced by 48 for every group decoded.
 * dec_table_neon64 - 128-byte decode table: bytes 0..63 form the first LUT,
 *                    bytes 64..127 the second.
 * src              - in/out: input cursor; advanced by 64 per group decoded.
 * src_len          - in/out: input bytes remaining; reduced by 64 per group.
 *
 * The loop stops when fewer than 64 bytes remain or when a group contains an
 * invalid character; in the latter case nothing is written or advanced for
 * that group.
 */
static void
base64_decode_neon64(void **dst, const uint8_t *dec_table_neon64, const uint8_t **src,
		     size_t *src_len)
{
	/*
	 * The first LUT (tbl_dec1) is read with TBL: out-of-range indices yield 0.
	 * The second LUT (tbl_dec2) is read with TBX: out-of-range indices leave the
	 * destination lane unchanged.
	 * Input [64..126] maps to index [1..63] in tbl_dec2. Index 0 means the value
	 * comes from tbl_dec1.
	 */
	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_table_neon64);
	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_table_neon64 + 64);
	const uint8x16_t offset = vdupq_n_u8(63U);

	while (*src_len >= 64) {
		/* Load 64 bytes and deinterleave: val[k] holds character k of each quad */
		uint8x16x4_t sextets = vld4q_u8(*src);
		uint8x16_t invalid = vdupq_n_u8(0);
		uint8x16x3_t out;
		int lane;

		for (lane = 0; lane < 4; lane++) {
			const uint8x16_t chars = sextets.val[lane];
			/* Saturating subtract: inputs <= 63 collapse to index 0 of the 2nd LUT */
			const uint8x16_t idx2 = vqsubq_u8(chars, offset);
			const uint8x16_t from_lut1 = vqtbl4q_u8(tbl_dec1, chars);
			/* Indices beyond the table keep idx2 itself, which is > 63 and so flagged below */
			const uint8x16_t from_lut2 = vqtbx4q_u8(idx2, tbl_dec2, idx2);

			/* Combine both lookups into the final 6-bit value (or an invalid marker) */
			sextets.val[lane] = vorrq_u8(from_lut1, from_lut2);

			/* Any value larger than 63 marks invalid input */
			invalid = vorrq_u8(invalid, CMPGT(sextets.val[lane], 63));
		}

		/* Bail out unless every lane came back clean */
		if (vmaxvq_u8(invalid) != 0U) {
			break;
		}

		/* Compress four 6-bit values into three bytes */
		out.val[0] = vorrq_u8(vshlq_n_u8(sextets.val[0], 2), vshrq_n_u8(sextets.val[1], 4));
		out.val[1] = vorrq_u8(vshlq_n_u8(sextets.val[1], 4), vshrq_n_u8(sextets.val[2], 2));
		out.val[2] = vorrq_u8(vshlq_n_u8(sextets.val[2], 6), sextets.val[3]);

		/* Interleave and store decoded result */
		vst3q_u8((uint8_t *)*dst, out);

		*src_len -= 64;
		*src += 64;
		*dst = (uint8_t *)*dst + 48;
	}
}
203