xref: /spdk/lib/json/json_parse.c (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/json.h"
7 
8 #include "spdk_internal/utf.h"
9 
10 #define SPDK_JSON_MAX_NESTING_DEPTH	64
11 
/*
 * Convert a single ASCII hexadecimal digit to its numeric value.
 *
 * \param c Input byte.
 *
 * \return 0-15 for '0'-'9', 'a'-'f' and 'A'-'F'; -1 for any other byte.
 */
static int
hex_value(uint8_t c)
{
	if (c >= '0' && c <= '9') {
		return c - '0';
	}

	if (c >= 'a' && c <= 'f') {
		return c - 'a' + 0xA;
	}

	if (c >= 'A' && c <= 'F') {
		return c - 'A' + 0xA;
	}

	/* Not a hexadecimal digit */
	return -1;
}
26 
27 static int
json_decode_string_escape_unicode(uint8_t ** strp,uint8_t * buf_end,uint8_t * out)28 json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
29 {
30 	uint8_t *str = *strp;
31 	int v0, v1, v2, v3;
32 	uint32_t val;
33 	uint32_t surrogate_high = 0;
34 	int rc;
35 decode:
36 	/* \uXXXX */
37 	assert(buf_end > str);
38 
39 	if (*str++ != '\\') { return SPDK_JSON_PARSE_INVALID; }
40 	if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
41 
42 	if (*str++ != 'u') { return SPDK_JSON_PARSE_INVALID; }
43 	if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
44 
45 	if ((v3 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
46 	if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
47 
48 	if ((v2 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
49 	if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
50 
51 	if ((v1 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
52 	if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
53 
54 	if ((v0 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
55 	if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
56 
57 	val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12);
58 
59 	if (surrogate_high) {
60 		/* We already parsed the high surrogate, so this should be the low part. */
61 		if (!utf16_valid_surrogate_low(val)) {
62 			return SPDK_JSON_PARSE_INVALID;
63 		}
64 
65 		/* Convert UTF-16 surrogate pair into codepoint and fall through to utf8_encode. */
66 		val = utf16_decode_surrogate_pair(surrogate_high, val);
67 	} else if (utf16_valid_surrogate_high(val)) {
68 		surrogate_high = val;
69 
70 		/*
71 		 * We parsed a \uXXXX sequence that decoded to the first half of a
72 		 *  UTF-16 surrogate pair, so it must be immediately followed by another
73 		 *  \uXXXX escape.
74 		 *
75 		 * Loop around to get the low half of the surrogate pair.
76 		 */
77 		if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
78 		goto decode;
79 	} else if (utf16_valid_surrogate_low(val)) {
80 		/*
81 		 * We found the second half of surrogate pair without the first half;
82 		 *  this is an invalid encoding.
83 		 */
84 		return SPDK_JSON_PARSE_INVALID;
85 	}
86 
87 	/*
88 	 * Convert Unicode escape (or surrogate pair) to UTF-8 in place.
89 	 *
90 	 * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes
91 	 *  (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a
92 	 *  single codepoint is 4 bytes.
93 	 */
94 	if (out) {
95 		rc = utf8_encode_unsafe(out, val);
96 	} else {
97 		rc = utf8_codepoint_len(val);
98 	}
99 	if (rc < 0) {
100 		return SPDK_JSON_PARSE_INVALID;
101 	}
102 
103 	*strp = str; /* update input pointer */
104 	return rc; /* return number of bytes decoded */
105 }
106 
107 static int
json_decode_string_escape_twochar(uint8_t ** strp,uint8_t * buf_end,uint8_t * out)108 json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
109 {
110 	static const uint8_t escapes[256] = {
111 		['b'] = '\b',
112 		['f'] = '\f',
113 		['n'] = '\n',
114 		['r'] = '\r',
115 		['t'] = '\t',
116 		['/'] = '/',
117 		['"'] = '"',
118 		['\\'] = '\\',
119 	};
120 	uint8_t *str = *strp;
121 	uint8_t c;
122 
123 	assert(buf_end > str);
124 	if (buf_end - str < 2) {
125 		return SPDK_JSON_PARSE_INCOMPLETE;
126 	}
127 
128 	assert(str[0] == '\\');
129 
130 	c = escapes[str[1]];
131 	if (c) {
132 		if (out) {
133 			*out = c;
134 		}
135 		*strp += 2; /* consumed two bytes */
136 		return 1; /* produced one byte */
137 	}
138 
139 	return SPDK_JSON_PARSE_INVALID;
140 }
141 
/*
 * Decode one JSON string backslash escape of either form.
 *
 * The short two-character escapes are tried first; whenever that attempt does not
 *  produce a byte, the input is handed to the \uXXXX decoder, whose result is returned.
 *
 * \param strp pointer to pointer to first character of escape (the backslash).
 *  *strp is also advanced to indicate how much input was consumed.
 * \param buf_end One past the last valid input byte.
 * \param out Destination for decoded bytes, or NULL to only compute the length.
 *
 * \return Number of bytes appended to out, or a negative SPDK_JSON_PARSE_* code.
 */
static int
json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
{
	const int produced = json_decode_string_escape_twochar(strp, buf_end, out);

	if (produced <= 0) {
		return json_decode_string_escape_unicode(strp, buf_end, out);
	}

	return produced;
}
161 
162 /*
163  * Decode JSON string in place.
164  *
165  * \param str_start Pointer to the beginning of the string (the opening " character).
166  *
167  * \return Number of bytes in decoded string (beginning from start).
168  */
169 static int
json_decode_string(uint8_t * str_start,uint8_t * buf_end,uint8_t ** str_end,uint32_t flags)170 json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags)
171 {
172 	uint8_t *str = str_start;
173 	uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */
174 	int rc;
175 
176 	if (buf_end - str_start < 2) {
177 		/*
178 		 * Shortest valid string (the empty string) is two bytes (""),
179 		 *  so this can't possibly be valid
180 		 */
181 		*str_end = str;
182 		return SPDK_JSON_PARSE_INCOMPLETE;
183 	}
184 
185 	if (*str++ != '"') {
186 		*str_end = str;
187 		return SPDK_JSON_PARSE_INVALID;
188 	}
189 
190 	while (str < buf_end) {
191 		if (str[0] == '"') {
192 			/*
193 			 * End of string.
194 			 * Update str_end to point at next input byte and return output length.
195 			 */
196 			*str_end = str + 1;
197 			return out - str_start - 1;
198 		} else if (str[0] == '\\') {
199 			rc = json_decode_string_escape(&str, buf_end,
200 						       flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL);
201 			assert(rc != 0);
202 			if (rc < 0) {
203 				*str_end = str;
204 				return rc;
205 			}
206 			out += rc;
207 		} else if (str[0] <= 0x1f) {
208 			/* control characters must be escaped */
209 			*str_end = str;
210 			return SPDK_JSON_PARSE_INVALID;
211 		} else {
212 			rc = utf8_valid(str, buf_end);
213 			if (rc == 0) {
214 				*str_end = str;
215 				return SPDK_JSON_PARSE_INCOMPLETE;
216 			} else if (rc < 0) {
217 				*str_end = str;
218 				return SPDK_JSON_PARSE_INVALID;
219 			}
220 
221 			if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) {
222 				memmove(out, str, rc);
223 			}
224 			out += rc;
225 			str += rc;
226 		}
227 	}
228 
229 	/* If execution gets here, we ran out of buffer. */
230 	*str_end = str;
231 	return SPDK_JSON_PARSE_INCOMPLETE;
232 }
233 
234 static int
json_valid_number(uint8_t * start,uint8_t * buf_end)235 json_valid_number(uint8_t *start, uint8_t *buf_end)
236 {
237 	uint8_t *p = start;
238 	uint8_t c;
239 
240 	if (p >= buf_end) { return -1; }
241 
242 	c = *p++;
243 	if (c >= '1' && c <= '9') { goto num_int_digits; }
244 	if (c == '0') { goto num_frac_or_exp; }
245 	if (c == '-') { goto num_int_first_digit; }
246 	p--;
247 	goto done_invalid;
248 
249 num_int_first_digit:
250 	if (spdk_likely(p != buf_end)) {
251 		c = *p++;
252 		if (c == '0') { goto num_frac_or_exp; }
253 		if (c >= '1' && c <= '9') { goto num_int_digits; }
254 		p--;
255 	}
256 	goto done_invalid;
257 
258 num_int_digits:
259 	if (spdk_likely(p != buf_end)) {
260 		c = *p++;
261 		if (c >= '0' && c <= '9') { goto num_int_digits; }
262 		if (c == '.') { goto num_frac_first_digit; }
263 		if (c == 'e' || c == 'E') { goto num_exp_sign; }
264 		p--;
265 	}
266 	goto done_valid;
267 
268 num_frac_or_exp:
269 	if (spdk_likely(p != buf_end)) {
270 		c = *p++;
271 		if (c == '.') { goto num_frac_first_digit; }
272 		if (c == 'e' || c == 'E') { goto num_exp_sign; }
273 		p--;
274 	}
275 	goto done_valid;
276 
277 num_frac_first_digit:
278 	if (spdk_likely(p != buf_end)) {
279 		c = *p++;
280 		if (c >= '0' && c <= '9') { goto num_frac_digits; }
281 		p--;
282 	}
283 	goto done_invalid;
284 
285 num_frac_digits:
286 	if (spdk_likely(p != buf_end)) {
287 		c = *p++;
288 		if (c >= '0' && c <= '9') { goto num_frac_digits; }
289 		if (c == 'e' || c == 'E') { goto num_exp_sign; }
290 		p--;
291 	}
292 	goto done_valid;
293 
294 num_exp_sign:
295 	if (spdk_likely(p != buf_end)) {
296 		c = *p++;
297 		if (c >= '0' && c <= '9') { goto num_exp_digits; }
298 		if (c == '-' || c == '+') { goto num_exp_first_digit; }
299 		p--;
300 	}
301 	goto done_invalid;
302 
303 num_exp_first_digit:
304 	if (spdk_likely(p != buf_end)) {
305 		c = *p++;
306 		if (c >= '0' && c <= '9') { goto num_exp_digits; }
307 		p--;
308 	}
309 	goto done_invalid;
310 
311 num_exp_digits:
312 	if (spdk_likely(p != buf_end)) {
313 		c = *p++;
314 		if (c >= '0' && c <= '9') { goto num_exp_digits; }
315 		p--;
316 	}
317 	goto done_valid;
318 
319 done_valid:
320 	/* Valid end state */
321 	return p - start;
322 
323 done_invalid:
324 	/* Invalid end state */
325 	if (p == buf_end) {
326 		/* Hit the end of the buffer - the stream is incomplete. */
327 		return SPDK_JSON_PARSE_INCOMPLETE;
328 	}
329 
330 	/* Found an invalid character in an invalid end state */
331 	return SPDK_JSON_PARSE_INVALID;
332 }
333 
334 static int
json_valid_comment(const uint8_t * start,const uint8_t * buf_end)335 json_valid_comment(const uint8_t *start, const uint8_t *buf_end)
336 {
337 	const uint8_t *p = start;
338 	bool multiline;
339 
340 	assert(buf_end > p);
341 	if (buf_end - p < 2) {
342 		return SPDK_JSON_PARSE_INCOMPLETE;
343 	}
344 
345 	if (p[0] != '/') {
346 		return SPDK_JSON_PARSE_INVALID;
347 	}
348 	if (p[1] == '*') {
349 		multiline = true;
350 	} else if (p[1] == '/') {
351 		multiline = false;
352 	} else {
353 		return SPDK_JSON_PARSE_INVALID;
354 	}
355 	p += 2;
356 
357 	if (multiline) {
358 		while (p != buf_end - 1) {
359 			if (p[0] == '*' && p[1] == '/') {
360 				/* Include the terminating star and slash in the comment */
361 				return p - start + 2;
362 			}
363 			p++;
364 		}
365 	} else {
366 		while (p != buf_end) {
367 			if (*p == '\r' || *p == '\n') {
368 				/* Do not include the line terminator in the comment */
369 				return p - start;
370 			}
371 			p++;
372 		}
373 	}
374 
375 	return SPDK_JSON_PARSE_INCOMPLETE;
376 }
377 
378 struct json_literal {
379 	enum spdk_json_val_type type;
380 	uint32_t len;
381 	uint8_t str[8];
382 };
383 
384 /*
385  * JSON only defines 3 possible literals; they can be uniquely identified by bits
386  *  3 and 4 of the first character:
387  *   'f' = 0b11[00]110
388  *   'n' = 0b11[01]110
389  *   't' = 0b11[10]100
390  * These two bits can be used as an index into the g_json_literals array.
391  */
392 static const struct json_literal g_json_literals[] = {
393 	{SPDK_JSON_VAL_FALSE, 5, "false"},
394 	{SPDK_JSON_VAL_NULL,  4, "null"},
395 	{SPDK_JSON_VAL_TRUE,  4, "true"},
396 	{}
397 };
398 
399 static int
match_literal(const uint8_t * start,const uint8_t * end,const uint8_t * literal,size_t len)400 match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len)
401 {
402 	assert(end >= start);
403 	if ((size_t)(end - start) < len) {
404 		return SPDK_JSON_PARSE_INCOMPLETE;
405 	}
406 
407 	if (memcmp(start, literal, len) != 0) {
408 		return SPDK_JSON_PARSE_INVALID;
409 	}
410 
411 	return len;
412 }
413 
414 ssize_t
spdk_json_parse(void * json,size_t size,struct spdk_json_val * values,size_t num_values,void ** end,uint32_t flags)415 spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values,
416 		void **end, uint32_t flags)
417 {
418 	uint8_t *json_end = json + size;
419 	enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH];
420 	size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH];
421 	enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID;
422 	bool trailing_comma = false;
423 	size_t depth = 0; /* index into containers */
424 	size_t cur_value = 0; /* index into values */
425 	size_t con_start_value;
426 	uint8_t *data = json;
427 	uint8_t *new_data;
428 	int rc = 0;
429 	const struct json_literal *lit;
430 	enum {
431 		STATE_VALUE, /* initial state */
432 		STATE_VALUE_SEPARATOR, /* value separator (comma) */
433 		STATE_NAME, /* "name": value */
434 		STATE_NAME_SEPARATOR, /* colon */
435 		STATE_END, /* parsed the complete value, so only whitespace is valid */
436 	} state = STATE_VALUE;
437 
438 #define ADD_VALUE(t, val_start_ptr, val_end_ptr) \
439 	if (values && cur_value < num_values) { \
440 		values[cur_value].type = t; \
441 		values[cur_value].start = val_start_ptr; \
442 		values[cur_value].len = val_end_ptr - val_start_ptr; \
443 	} \
444 	cur_value++
445 
446 	while (data < json_end) {
447 		uint8_t c = *data;
448 
449 		switch (c) {
450 		case ' ':
451 		case '\t':
452 		case '\r':
453 		case '\n':
454 			/* Whitespace is allowed between any tokens. */
455 			data++;
456 			break;
457 
458 		case 't':
459 		case 'f':
460 		case 'n':
461 			/* true, false, or null */
462 			if (state != STATE_VALUE) { goto done_invalid; }
463 			lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */
464 			assert(lit->str[0] == c);
465 			rc = match_literal(data, json_end, lit->str, lit->len);
466 			if (rc < 0) { goto done_rc; }
467 			ADD_VALUE(lit->type, data, data + rc);
468 			data += rc;
469 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
470 			trailing_comma = false;
471 			break;
472 
473 		case '"':
474 			if (state != STATE_VALUE && state != STATE_NAME) { goto done_invalid; }
475 			rc = json_decode_string(data, json_end, &new_data, flags);
476 			if (rc < 0) {
477 				data = new_data;
478 				goto done_rc;
479 			}
480 			/*
481 			 * Start is data + 1 to skip initial quote.
482 			 * Length is data + rc - 1 to skip both quotes.
483 			 */
484 			ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME,
485 				  data + 1, data + rc - 1);
486 			data = new_data;
487 			if (state == STATE_NAME) {
488 				state = STATE_NAME_SEPARATOR;
489 			} else {
490 				state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
491 			}
492 			trailing_comma = false;
493 			break;
494 
495 		case '-':
496 		case '0':
497 		case '1':
498 		case '2':
499 		case '3':
500 		case '4':
501 		case '5':
502 		case '6':
503 		case '7':
504 		case '8':
505 		case '9':
506 			if (state != STATE_VALUE) { goto done_invalid; }
507 			rc = json_valid_number(data, json_end);
508 			if (rc < 0) { goto done_rc; }
509 			ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc);
510 			data += rc;
511 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
512 			trailing_comma = false;
513 			break;
514 
515 		case '{':
516 		case '[':
517 			if (state != STATE_VALUE) { goto done_invalid; }
518 			if (depth == SPDK_JSON_MAX_NESTING_DEPTH) {
519 				rc = SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED;
520 				goto done_rc;
521 			}
522 			if (c == '{') {
523 				con_type = SPDK_JSON_VAL_OBJECT_BEGIN;
524 				state = STATE_NAME;
525 			} else {
526 				con_type = SPDK_JSON_VAL_ARRAY_BEGIN;
527 				state = STATE_VALUE;
528 			}
529 			con_value[depth] = cur_value;
530 			containers[depth++] = con_type;
531 			ADD_VALUE(con_type, data, data + 1);
532 			data++;
533 			trailing_comma = false;
534 			break;
535 
536 		case '}':
537 		case ']':
538 			if (trailing_comma) { goto done_invalid; }
539 			if (depth == 0) { goto done_invalid; }
540 			con_type = containers[--depth];
541 			con_start_value = con_value[depth];
542 			if (values && con_start_value < num_values) {
543 				values[con_start_value].len = cur_value - con_start_value - 1;
544 			}
545 			if (c == '}') {
546 				if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) {
547 					goto done_invalid;
548 				}
549 				if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) {
550 					goto done_invalid;
551 				}
552 				ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1);
553 			} else {
554 				if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) {
555 					goto done_invalid;
556 				}
557 				if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) {
558 					goto done_invalid;
559 				}
560 				ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1);
561 			}
562 			con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1];
563 			data++;
564 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
565 			trailing_comma = false;
566 			break;
567 
568 		case ',':
569 			if (state != STATE_VALUE_SEPARATOR) { goto done_invalid; }
570 			data++;
571 			assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN ||
572 			       con_type == SPDK_JSON_VAL_OBJECT_BEGIN);
573 			state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME;
574 			trailing_comma = true;
575 			break;
576 
577 		case ':':
578 			if (state != STATE_NAME_SEPARATOR) { goto done_invalid; }
579 			data++;
580 			state = STATE_VALUE;
581 			break;
582 
583 		case '/':
584 			if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) {
585 				goto done_invalid;
586 			}
587 			rc = json_valid_comment(data, json_end);
588 			if (rc < 0) { goto done_rc; }
589 			/* Skip over comment */
590 			data += rc;
591 			break;
592 
593 		default:
594 			goto done_invalid;
595 		}
596 
597 		if (state == STATE_END) {
598 			break;
599 		}
600 	}
601 
602 	if (state == STATE_END) {
603 		/* Skip trailing whitespace */
604 		while (data < json_end) {
605 			uint8_t c = *data;
606 
607 			if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
608 				data++;
609 			} else {
610 				break;
611 			}
612 		}
613 
614 		/*
615 		 * These asserts are just for sanity checking - they are guaranteed by the allowed
616 		 *  state transitions.
617 		 */
618 		assert(depth == 0);
619 		assert(trailing_comma == false);
620 		assert(data <= json_end);
621 		if (end) {
622 			*end = data;
623 		}
624 		return cur_value;
625 	}
626 
627 	/* Invalid end state - ran out of data */
628 	rc = SPDK_JSON_PARSE_INCOMPLETE;
629 
630 done_rc:
631 	assert(rc < 0);
632 	if (end) {
633 		*end = data;
634 	}
635 	return rc;
636 
637 done_invalid:
638 	rc = SPDK_JSON_PARSE_INVALID;
639 	goto done_rc;
640 }
641