xref: /spdk/lib/json/json_parse.c (revision edbca2a67610cf6ebb9755c839e307d1d18375da)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "json_internal.h"
35 
/*
 * Convert one ASCII hexadecimal digit to its numeric value.
 *
 * \param c Character to convert.
 *
 * \return 0-15 for '0'-'9', 'A'-'F' and 'a'-'f'; -1 for any other byte.
 */
static int
hex_value(uint8_t c)
{
	if (c >= '0' && c <= '9') {
		return c - '0';
	}

	if (c >= 'A' && c <= 'F') {
		return c - 'A' + 0xA;
	}

	if (c >= 'a' && c <= 'f') {
		return c - 'a' + 0xA;
	}

	return -1;
}
50 
51 static int
52 json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
53 {
54 	uint8_t *str = *strp;
55 	int v0, v1, v2, v3;
56 	uint32_t val;
57 	uint32_t surrogate_high = 0;
58 	int rc;
59 decode:
60 	/* \uXXXX */
61 	assert(buf_end > str);
62 
63 	if (*str++ != '\\') return SPDK_JSON_PARSE_INVALID;
64 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
65 
66 	if (*str++ != 'u') return SPDK_JSON_PARSE_INVALID;
67 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
68 
69 	if ((v3 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
70 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
71 
72 	if ((v2 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
73 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
74 
75 	if ((v1 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
76 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
77 
78 	if ((v0 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
79 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
80 
81 	val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12);
82 
83 	if (surrogate_high) {
84 		/* We already parsed the high surrogate, so this should be the low part. */
85 		if (!utf16_valid_surrogate_low(val)) {
86 			return SPDK_JSON_PARSE_INVALID;
87 		}
88 
89 		/* Convert UTF-16 surrogate pair into codepoint and fall through to utf8_encode. */
90 		val = utf16_decode_surrogate_pair(surrogate_high, val);
91 	} else if (utf16_valid_surrogate_high(val)) {
92 		surrogate_high = val;
93 
94 		/*
95 		 * We parsed a \uXXXX sequence that decoded to the first half of a
96 		 *  UTF-16 surrogate pair, so it must be immediately followed by another
97 		 *  \uXXXX escape.
98 		 *
99 		 * Loop around to get the low half of the surrogate pair.
100 		 */
101 		if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
102 		goto decode;
103 	} else if (utf16_valid_surrogate_low(val)) {
104 		/*
105 		 * We found the second half of surrogate pair without the first half;
106 		 *  this is an invalid encoding.
107 		 */
108 		return SPDK_JSON_PARSE_INVALID;
109 	}
110 
111 	/*
112 	 * Convert Unicode escape (or surrogate pair) to UTF-8 in place.
113 	 *
114 	 * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes
115 	 *  (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a
116 	 *  single codepoint is 4 bytes.
117 	 */
118 	if (out) {
119 		rc = utf8_encode_unsafe(out, val);
120 	} else {
121 		rc = utf8_codepoint_len(val);
122 	}
123 	if (rc < 0) {
124 		return SPDK_JSON_PARSE_INVALID;
125 	}
126 
127 	*strp = str; /* update input pointer */
128 	return rc; /* return number of bytes decoded */
129 }
130 
131 static int
132 json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
133 {
134 	static const uint8_t escapes[256] = {
135 		['b'] = '\b',
136 		['f'] = '\f',
137 		['n'] = '\n',
138 		['r'] = '\r',
139 		['t'] = '\t',
140 		['/'] = '/',
141 		['"'] = '"',
142 		['\\'] = '\\',
143 	};
144 	uint8_t *str = *strp;
145 	uint8_t c;
146 
147 	assert(buf_end > str);
148 	if (buf_end - str < 2) {
149 		return SPDK_JSON_PARSE_INCOMPLETE;
150 	}
151 
152 	assert(str[0] == '\\');
153 
154 	c = escapes[str[1]];
155 	if (c) {
156 		if (out) {
157 			*out = c;
158 		}
159 		*strp += 2; /* consumed two bytes */
160 		return 1; /* produced one byte */
161 	}
162 
163 	return SPDK_JSON_PARSE_INVALID;
164 }
165 
/*
 * Decode one JSON string backslash escape of either form.
 *
 * The two-character escapes are tried first; if that attempt does not produce a byte, the
 *  input is handed to the \uXXXX decoder, whose result is returned as-is.
 *
 * \param strp pointer to pointer to first character of escape (the backslash).
 *  *strp is also advanced to indicate how much input was consumed.
 * \param buf_end End of the input buffer (one past the last byte).
 * \param out Destination for the decoded bytes; passed through to the decoders, which
 *  accept NULL to skip writing.
 *
 * \return Number of bytes appended to out, or a negative SPDK_JSON_PARSE_* code.
 */
static int
json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
{
	int produced = json_decode_string_escape_twochar(strp, buf_end, out);

	if (produced <= 0) {
		produced = json_decode_string_escape_unicode(strp, buf_end, out);
	}

	return produced;
}
185 
186 /*
187  * Decode JSON string in place.
188  *
189  * \param str_start Pointer to the beginning of the string (the opening " character).
190  *
191  * \return Number of bytes in decoded string (beginning from start).
192  */
193 static int
194 json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags)
195 {
196 	uint8_t *str = str_start;
197 	uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */
198 	int rc;
199 
200 	if (buf_end - str_start < 2) {
201 		/*
202 		 * Shortest valid string (the empty string) is two bytes (""),
203 		 *  so this can't possibly be valid
204 		 */
205 		*str_end = str;
206 		return SPDK_JSON_PARSE_INCOMPLETE;
207 	}
208 
209 	if (*str++ != '"') {
210 		*str_end = str;
211 		return SPDK_JSON_PARSE_INVALID;
212 	}
213 
214 	while (str < buf_end) {
215 		if (str[0] == '"') {
216 			/*
217 			 * End of string.
218 			 * Update str_end to point at next input byte and return output length.
219 			 */
220 			*str_end = str + 1;
221 			return out - str_start - 1;
222 		} else if (str[0] == '\\') {
223 			rc = json_decode_string_escape(&str, buf_end,
224 						       flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL);
225 			assert(rc != 0);
226 			if (rc < 0) {
227 				*str_end = str;
228 				return rc;
229 			}
230 			out += rc;
231 		} else if (str[0] <= 0x1f) {
232 			/* control characters must be escaped */
233 			*str_end = str;
234 			return SPDK_JSON_PARSE_INVALID;
235 		} else {
236 			rc = utf8_valid(str, buf_end);
237 			if (rc == 0) {
238 				*str_end = str;
239 				return SPDK_JSON_PARSE_INCOMPLETE;
240 			} else if (rc < 0) {
241 				*str_end = str;
242 				return SPDK_JSON_PARSE_INVALID;
243 			}
244 
245 			if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) {
246 				memmove(out, str, rc);
247 			}
248 			out += rc;
249 			str += rc;
250 		}
251 	}
252 
253 	/* If execution gets here, we ran out of buffer. */
254 	*str_end = str;
255 	return SPDK_JSON_PARSE_INCOMPLETE;
256 }
257 
258 static int
259 json_valid_number(uint8_t *start, uint8_t *buf_end)
260 {
261 	uint8_t *p = start;
262 	uint8_t c;
263 
264 	if (p >= buf_end) return -1;
265 
266 	c = *p++;
267 	if (c >= '1' && c <= '9') goto num_int_digits;
268 	if (c == '0') goto num_frac_or_exp;
269 	if (c == '-') goto num_int_first_digit;
270 	p--;
271 	goto done_invalid;
272 
273 num_int_first_digit:
274 	if (spdk_likely(p != buf_end)) {
275 		c = *p++;
276 		if (c == '0') goto num_frac_or_exp;
277 		if (c >= '1' && c <= '9') goto num_int_digits;
278 		p--;
279 	}
280 	goto done_invalid;
281 
282 num_int_digits:
283 	if (spdk_likely(p != buf_end)) {
284 		c = *p++;
285 		if (c >= '0' && c <= '9') goto num_int_digits;
286 		if (c == '.') goto num_frac_first_digit;
287 		if (c == 'e' || c == 'E') goto num_exp_sign;
288 		p--;
289 	}
290 	goto done_valid;
291 
292 num_frac_or_exp:
293 	if (spdk_likely(p != buf_end)) {
294 		c = *p++;
295 		if (c == '.') goto num_frac_first_digit;
296 		if (c == 'e' || c == 'E') goto num_exp_sign;
297 		p--;
298 	}
299 	goto done_valid;
300 
301 num_frac_first_digit:
302 	if (spdk_likely(p != buf_end)) {
303 		c = *p++;
304 		if (c >= '0' && c <= '9') goto num_frac_digits;
305 		p--;
306 	}
307 	goto done_invalid;
308 
309 num_frac_digits:
310 	if (spdk_likely(p != buf_end)) {
311 		c = *p++;
312 		if (c >= '0' && c <= '9') goto num_frac_digits;
313 		if (c == 'e' || c == 'E') goto num_exp_sign;
314 		p--;
315 	}
316 	goto done_valid;
317 
318 num_exp_sign:
319 	if (spdk_likely(p != buf_end)) {
320 		c = *p++;
321 		if (c >= '0' && c <= '9') goto num_exp_digits;
322 		if (c == '-' || c == '+') goto num_exp_first_digit;
323 		p--;
324 	}
325 	goto done_invalid;
326 
327 num_exp_first_digit:
328 	if (spdk_likely(p != buf_end)) {
329 		c = *p++;
330 		if (c >= '0' && c <= '9') goto num_exp_digits;
331 		p--;
332 	}
333 	goto done_invalid;
334 
335 num_exp_digits:
336 	if (spdk_likely(p != buf_end)) {
337 		c = *p++;
338 		if (c >= '0' && c <= '9') goto num_exp_digits;
339 		p--;
340 	}
341 	goto done_valid;
342 
343 done_valid:
344 	/* Valid end state */
345 	return p - start;
346 
347 done_invalid:
348 	/* Invalid end state */
349 	if (p == buf_end) {
350 		/* Hit the end of the buffer - the stream is incomplete. */
351 		return SPDK_JSON_PARSE_INCOMPLETE;
352 	}
353 
354 	/* Found an invalid character in an invalid end state */
355 	return SPDK_JSON_PARSE_INVALID;
356 }
357 
358 static int
359 json_valid_comment(const uint8_t *start, const uint8_t *buf_end)
360 {
361 	const uint8_t *p = start;
362 	bool multiline;
363 
364 	assert(buf_end > p);
365 	if (buf_end - p < 2) {
366 		return SPDK_JSON_PARSE_INCOMPLETE;
367 	}
368 
369 	if (p[0] != '/') {
370 		return SPDK_JSON_PARSE_INVALID;
371 	}
372 	if (p[1] == '*') {
373 		multiline = true;
374 	} else if (p[1] == '/') {
375 		multiline = false;
376 	} else {
377 		return SPDK_JSON_PARSE_INVALID;
378 	}
379 	p += 2;
380 
381 	if (multiline) {
382 		while (p != buf_end - 1) {
383 			if (p[0] == '*' && p[1] == '/') {
384 				/* Include the terminating star and slash in the comment */
385 				return p - start + 2;
386 			}
387 			p++;
388 		}
389 	} else {
390 		while (p != buf_end) {
391 			if (*p == '\r' || *p == '\n') {
392 				/* Do not include the line terminator in the comment */
393 				return p - start;
394 			}
395 			p++;
396 		}
397 	}
398 
399 	return SPDK_JSON_PARSE_INCOMPLETE;
400 }
401 
402 struct json_literal {
403 	enum spdk_json_val_type type;
404 	uint32_t len;
405 	uint8_t str[8];
406 };
407 
408 /*
409  * JSON only defines 3 possible literals; they can be uniquely identified by bits
410  *  3 and 4 of the first character:
411  *   'f' = 0b11[00]110
412  *   'n' = 0b11[01]110
413  *   't' = 0b11[10]100
414  * These two bits can be used as an index into the g_json_literals array.
415  */
416 static const struct json_literal g_json_literals[] = {
417 	{SPDK_JSON_VAL_FALSE, 5, "false"},
418 	{SPDK_JSON_VAL_NULL,  4, "null"},
419 	{SPDK_JSON_VAL_TRUE,  4, "true"},
420 	{}
421 };
422 
423 static int
424 match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len)
425 {
426 	assert(end >= start);
427 	if ((size_t)(end - start) < len) {
428 		return SPDK_JSON_PARSE_INCOMPLETE;
429 	}
430 
431 	if (memcmp(start, literal, len) != 0) {
432 		return SPDK_JSON_PARSE_INVALID;
433 	}
434 
435 	return len;
436 }
437 
438 ssize_t
439 spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values,
440 		void **end, uint32_t flags)
441 {
442 	uint8_t *json_end = json + size;
443 	enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH];
444 	size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH];
445 	enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID;
446 	bool trailing_comma = false;
447 	size_t depth = 0; /* index into containers */
448 	size_t cur_value = 0; /* index into values */
449 	size_t con_start_value;
450 	uint8_t *data = json;
451 	uint8_t *new_data;
452 	int rc = 0;
453 	const struct json_literal *lit;
454 	enum {
455 		STATE_VALUE, /* initial state */
456 		STATE_VALUE_SEPARATOR, /* value separator (comma) */
457 		STATE_NAME, /* "name": value */
458 		STATE_NAME_SEPARATOR, /* colon */
459 		STATE_END, /* parsed the complete value, so only whitespace is valid */
460 	} state = STATE_VALUE;
461 
462 #define ADD_VALUE(t, val_start_ptr, val_end_ptr) \
463 	if (values && cur_value < num_values) { \
464 		values[cur_value].type = t; \
465 		values[cur_value].start = val_start_ptr; \
466 		values[cur_value].len = val_end_ptr - val_start_ptr; \
467 	} \
468 	cur_value++
469 
470 	while (data < json_end) {
471 		uint8_t c = *data;
472 
473 		switch (c) {
474 		case ' ':
475 		case '\t':
476 		case '\r':
477 		case '\n':
478 			/* Whitespace is allowed between any tokens. */
479 			data++;
480 			break;
481 
482 		case 't':
483 		case 'f':
484 		case 'n':
485 			/* true, false, or null */
486 			if (state != STATE_VALUE) goto done_invalid;
487 			lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */
488 			assert(lit->str[0] == c);
489 			rc = match_literal(data, json_end, lit->str, lit->len);
490 			if (rc < 0) goto done_rc;
491 			ADD_VALUE(lit->type, data, data + rc);
492 			data += rc;
493 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
494 			trailing_comma = false;
495 			break;
496 
497 		case '"':
498 			if (state != STATE_VALUE && state != STATE_NAME) goto done_invalid;
499 			rc = json_decode_string(data, json_end, &new_data, flags);
500 			if (rc < 0) {
501 				data = new_data;
502 				goto done_rc;
503 			}
504 			/*
505 			 * Start is data + 1 to skip initial quote.
506 			 * Length is data + rc - 1 to skip both quotes.
507 			 */
508 			ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME,
509 				  data + 1, data + rc - 1);
510 			data = new_data;
511 			if (state == STATE_NAME) {
512 				state = STATE_NAME_SEPARATOR;
513 			} else {
514 				state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
515 			}
516 			trailing_comma = false;
517 			break;
518 
519 		case '-':
520 		case '0':
521 		case '1':
522 		case '2':
523 		case '3':
524 		case '4':
525 		case '5':
526 		case '6':
527 		case '7':
528 		case '8':
529 		case '9':
530 			if (state != STATE_VALUE) goto done_invalid;
531 			rc = json_valid_number(data, json_end);
532 			if (rc < 0) goto done_rc;
533 			ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc);
534 			data += rc;
535 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
536 			trailing_comma = false;
537 			break;
538 
539 		case '{':
540 		case '[':
541 			if (state != STATE_VALUE) goto done_invalid;
542 			if (depth == SPDK_JSON_MAX_NESTING_DEPTH) {
543 				rc = SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED;
544 				goto done_rc;
545 			}
546 			if (c == '{') {
547 				con_type = SPDK_JSON_VAL_OBJECT_BEGIN;
548 				state = STATE_NAME;
549 			} else {
550 				con_type = SPDK_JSON_VAL_ARRAY_BEGIN;
551 				state = STATE_VALUE;
552 			}
553 			con_value[depth] = cur_value;
554 			containers[depth++] = con_type;
555 			ADD_VALUE(con_type, data, data + 1);
556 			data++;
557 			trailing_comma = false;
558 			break;
559 
560 		case '}':
561 		case ']':
562 			if (trailing_comma) goto done_invalid;
563 			if (depth == 0) goto done_invalid;
564 			con_type = containers[--depth];
565 			con_start_value = con_value[depth];
566 			if (values && con_start_value < num_values) {
567 				values[con_start_value].len = cur_value - con_start_value - 1;
568 			}
569 			if (c == '}') {
570 				if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) {
571 					goto done_invalid;
572 				}
573 				if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) {
574 					goto done_invalid;
575 				}
576 				ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1);
577 			} else {
578 				if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) {
579 					goto done_invalid;
580 				}
581 				if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) {
582 					goto done_invalid;
583 				}
584 				ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1);
585 			}
586 			con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1];
587 			data++;
588 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
589 			trailing_comma = false;
590 			break;
591 
592 		case ',':
593 			if (state != STATE_VALUE_SEPARATOR) goto done_invalid;
594 			data++;
595 			assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN ||
596 			       con_type == SPDK_JSON_VAL_OBJECT_BEGIN);
597 			state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME;
598 			trailing_comma = true;
599 			break;
600 
601 		case ':':
602 			if (state != STATE_NAME_SEPARATOR) goto done_invalid;
603 			data++;
604 			state = STATE_VALUE;
605 			break;
606 
607 		case '/':
608 			if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) {
609 				goto done_invalid;
610 			}
611 			rc = json_valid_comment(data, json_end);
612 			if (rc < 0) goto done_rc;
613 			/* Skip over comment */
614 			data += rc;
615 			break;
616 
617 		default:
618 			goto done_invalid;
619 		}
620 
621 		if (state == STATE_END) {
622 			break;
623 		}
624 	}
625 
626 	if (state == STATE_END) {
627 		/* Skip trailing whitespace */
628 		while (data < json_end) {
629 			uint8_t c = *data;
630 
631 			if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
632 				data++;
633 			} else {
634 				break;
635 			}
636 		}
637 
638 		/*
639 		 * These asserts are just for sanity checking - they are guaranteed by the allowed
640 		 *  state transitions.
641 		 */
642 		assert(depth == 0);
643 		assert(trailing_comma == false);
644 		assert(data <= json_end);
645 		if (end) {
646 			*end = data;
647 		}
648 		return cur_value;
649 	}
650 
651 	/* Invalid end state - ran out of data */
652 	rc = SPDK_JSON_PARSE_INCOMPLETE;
653 
654 done_rc:
655 	assert(rc < 0);
656 	if (end) {
657 		*end = data;
658 	}
659 	return rc;
660 
661 done_invalid:
662 	rc = SPDK_JSON_PARSE_INVALID;
663 	goto done_rc;
664 }
665