xref: /spdk/lib/json/json_parse.c (revision 376d117c90d185e20b31062cd773e7af1eb14bae)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "json_internal.h"
35 
/*
 * Convert a single hexadecimal digit character to its numeric value.
 *
 * \param c Input byte.
 *
 * \return 0 through 15 when c is one of 0-9, A-F or a-f; -1 for any other byte.
 */
static int
hex_value(uint8_t c)
{
	if (c >= '0' && c <= '9') {
		return c - '0';
	}

	if (c >= 'A' && c <= 'F') {
		return c - 'A' + 0xA;
	}

	if (c >= 'a' && c <= 'f') {
		return c - 'a' + 0xA;
	}

	/* Not a hex digit. */
	return -1;
}
50 
51 static int
52 json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
53 {
54 	uint8_t *str = *strp;
55 	int v0, v1, v2, v3;
56 	uint32_t val;
57 	uint32_t surrogate_high = 0;
58 	int rc;
59 decode:
60 	/* \uXXXX */
61 	assert(buf_end > str);
62 
63 	if (*str++ != '\\') return SPDK_JSON_PARSE_INVALID;
64 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
65 
66 	if (*str++ != 'u') return SPDK_JSON_PARSE_INVALID;
67 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
68 
69 	if ((v3 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
70 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
71 
72 	if ((v2 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
73 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
74 
75 	if ((v1 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
76 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
77 
78 	if ((v0 = hex_value(*str++)) < 0) return SPDK_JSON_PARSE_INVALID;
79 	if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
80 
81 	val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12);
82 
83 	if (surrogate_high) {
84 		/* We already parsed the high surrogate, so this should be the low part. */
85 		if (!utf16_valid_surrogate_low(val)) {
86 			return SPDK_JSON_PARSE_INVALID;
87 		}
88 
89 		/* Convert UTF-16 surrogate pair into codepoint and fall through to utf8_encode. */
90 		val = utf16_decode_surrogate_pair(surrogate_high, val);
91 	} else if (utf16_valid_surrogate_high(val)) {
92 		surrogate_high = val;
93 
94 		/*
95 		 * We parsed a \uXXXX sequence that decoded to the first half of a
96 		 *  UTF-16 surrogate pair, so it must be immediately followed by another
97 		 *  \uXXXX escape.
98 		 *
99 		 * Loop around to get the low half of the surrogate pair.
100 		 */
101 		if (buf_end == str) return SPDK_JSON_PARSE_INCOMPLETE;
102 		goto decode;
103 	} else if (utf16_valid_surrogate_low(val)) {
104 		/*
105 		 * We found the second half of surrogate pair without the first half;
106 		 *  this is an invalid encoding.
107 		 */
108 		return SPDK_JSON_PARSE_INVALID;
109 	}
110 
111 	/*
112 	 * Convert Unicode escape (or surrogate pair) to UTF-8 in place.
113 	 *
114 	 * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes
115 	 *  (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a
116 	 *  single codepoint is 4 bytes.
117 	 */
118 	if (out) {
119 		rc = utf8_encode_unsafe(out, val);
120 	} else {
121 		rc = utf8_codepoint_len(val);
122 	}
123 	if (rc < 0) {
124 		return SPDK_JSON_PARSE_INVALID;
125 	}
126 
127 	*strp = str; /* update input pointer */
128 	return rc; /* return number of bytes decoded */
129 }
130 
131 static int
132 json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
133 {
134 	static const uint8_t escapes[256] = {
135 		['b'] = '\b',
136 		['f'] = '\f',
137 		['n'] = '\n',
138 		['r'] = '\r',
139 		['t'] = '\t',
140 		['/'] = '/',
141 		['"'] = '"',
142 		['\\'] = '\\',
143 	};
144 	uint8_t *str = *strp;
145 	uint8_t c;
146 
147 	assert(buf_end > str);
148 	if (buf_end - str < 2) {
149 		return SPDK_JSON_PARSE_INCOMPLETE;
150 	}
151 
152 	assert(str[0] == '\\');
153 
154 	c = escapes[str[1]];
155 	if (c) {
156 		if (out) {
157 			*out = c;
158 		}
159 		*strp += 2; /* consumed two bytes */
160 		return 1; /* produced one byte */
161 	}
162 
163 	return SPDK_JSON_PARSE_INVALID;
164 }
165 
/*
 * Decode one JSON string backslash escape of either form.
 *
 * The two-character escapes are tried first; whenever that attempt does not produce
 *  a byte, the input is handed to the \uXXXX decoder instead.
 *
 * \param strp pointer to pointer to first character of escape (the backslash).
 *  *strp is advanced past the escape when decoding succeeds.
 * \param buf_end One past the last readable input byte.
 * \param out Destination for decoded bytes, or NULL to only measure them.
 *
 * \return Number of bytes produced for out, or a negative SPDK_JSON_PARSE_* code.
 */
static int
json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
{
	int produced = json_decode_string_escape_twochar(strp, buf_end, out);

	if (produced <= 0) {
		produced = json_decode_string_escape_unicode(strp, buf_end, out);
	}

	return produced;
}
185 
186 /*
187  * Decode JSON string in place.
188  *
189  * \param str_start Pointer to the beginning of the string (the opening " character).
190  *
191  * \return Number of bytes in decoded string (beginning from start).
192  */
193 static int
194 json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags)
195 {
196 	uint8_t *str = str_start;
197 	uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */
198 	int rc;
199 
200 	if (buf_end - str_start < 2) {
201 		/*
202 		 * Shortest valid string (the empty string) is two bytes (""),
203 		 *  so this can't possibly be valid
204 		 */
205 		return SPDK_JSON_PARSE_INCOMPLETE;
206 	}
207 
208 	if (*str++ != '"') {
209 		return SPDK_JSON_PARSE_INVALID;
210 	}
211 
212 	while (str < buf_end) {
213 		if (str[0] == '"') {
214 			/*
215 			 * End of string.
216 			 * Update str_end to point at next input byte and return output length.
217 			 */
218 			*str_end = str + 1;
219 			return out - str_start - 1;
220 		} else if (str[0] == '\\') {
221 			rc = json_decode_string_escape(&str, buf_end,
222 						       flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL);
223 			assert(rc != 0);
224 			if (rc < 0) {
225 				return rc;
226 			}
227 			out += rc;
228 		} else if (str[0] <= 0x1f) {
229 			/* control characters must be escaped */
230 			return SPDK_JSON_PARSE_INVALID;
231 		} else {
232 			rc = utf8_valid(str, buf_end);
233 			if (rc == 0) {
234 				return SPDK_JSON_PARSE_INCOMPLETE;
235 			} else if (rc < 0) {
236 				return SPDK_JSON_PARSE_INVALID;
237 			}
238 
239 			if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) {
240 				memmove(out, str, rc);
241 			}
242 			out += rc;
243 			str += rc;
244 		}
245 	}
246 
247 	/* If execution gets here, we ran out of buffer. */
248 	return SPDK_JSON_PARSE_INCOMPLETE;
249 }
250 
251 static int
252 json_valid_number(uint8_t *start, uint8_t *buf_end)
253 {
254 	uint8_t *p = start;
255 	enum {
256 		NUM_STATE_START,
257 		NUM_STATE_INT_FIRST_DIGIT,
258 		NUM_STATE_INT_DIGITS,
259 		NUM_STATE_FRAC_OR_EXP,
260 		NUM_STATE_FRAC_FIRST_DIGIT,
261 		NUM_STATE_FRAC_DIGITS,
262 		NUM_STATE_EXP_SIGN,
263 		NUM_STATE_EXP_FIRST_DIGIT,
264 		NUM_STATE_EXP_DIGITS,
265 	} state = NUM_STATE_START;
266 
267 	if (p >= buf_end) return -1;
268 
269 	while (p != buf_end) {
270 		uint8_t c = *p++;
271 
272 		switch (c) {
273 		case '0':
274 			if (state == NUM_STATE_START || state == NUM_STATE_INT_FIRST_DIGIT) {
275 				/*
276 				 * If the very first digit is 0,
277 				 *  it must be the last digit of the integer part
278 				 *  (no leading zeroes allowed).
279 				 */
280 				state = NUM_STATE_FRAC_OR_EXP;
281 				break;
282 			}
283 		/* fallthrough */
284 		case '1':
285 		case '2':
286 		case '3':
287 		case '4':
288 		case '5':
289 		case '6':
290 		case '7':
291 		case '8':
292 		case '9':
293 			switch (state) {
294 			case NUM_STATE_START:
295 			case NUM_STATE_INT_FIRST_DIGIT:
296 				state = NUM_STATE_INT_DIGITS;
297 				break;
298 
299 			case NUM_STATE_FRAC_FIRST_DIGIT:
300 				state = NUM_STATE_FRAC_DIGITS;
301 				break;
302 
303 			case NUM_STATE_EXP_SIGN:
304 			case NUM_STATE_EXP_FIRST_DIGIT:
305 				state = NUM_STATE_EXP_DIGITS;
306 				break;
307 
308 			case NUM_STATE_INT_DIGITS:
309 			case NUM_STATE_FRAC_DIGITS:
310 			case NUM_STATE_EXP_DIGITS:
311 				/* stay in same state */
312 				break;
313 
314 			default:
315 				return SPDK_JSON_PARSE_INVALID;
316 			}
317 			break;
318 
319 		case '.':
320 			if (state != NUM_STATE_INT_DIGITS && state != NUM_STATE_FRAC_OR_EXP) {
321 				return SPDK_JSON_PARSE_INVALID;
322 			}
323 			state = NUM_STATE_FRAC_FIRST_DIGIT;
324 			break;
325 
326 		case 'e':
327 		case 'E':
328 			switch (state) {
329 			case NUM_STATE_INT_DIGITS:
330 			case NUM_STATE_FRAC_OR_EXP:
331 			case NUM_STATE_FRAC_DIGITS:
332 				state = NUM_STATE_EXP_SIGN;
333 				break;
334 			default:
335 				return SPDK_JSON_PARSE_INVALID;
336 			}
337 			break;
338 
339 		case '-':
340 			if (state == NUM_STATE_START) {
341 				state = NUM_STATE_INT_FIRST_DIGIT;
342 				break;
343 			}
344 		/* fallthrough */
345 		case '+':
346 			if (state == NUM_STATE_EXP_SIGN) {
347 				state = NUM_STATE_EXP_FIRST_DIGIT;
348 			} else {
349 				return SPDK_JSON_PARSE_INVALID;
350 			}
351 			break;
352 		default:
353 			/*
354 			 * Got an unexpected character - back up and stop parsing number.
355 			 * The top-level parsing code will handle invalid trailing characters.
356 			 */
357 			p--;
358 			goto done;
359 		}
360 	}
361 
362 done:
363 	switch (state) {
364 	case NUM_STATE_INT_DIGITS:
365 	case NUM_STATE_FRAC_OR_EXP:
366 	case NUM_STATE_FRAC_DIGITS:
367 	case NUM_STATE_EXP_DIGITS:
368 		/* Valid end state */
369 		return p - start;
370 
371 	default:
372 		return SPDK_JSON_PARSE_INCOMPLETE;
373 	}
374 }
375 
/* Describes one of the bare-word JSON literals recognized by the parser. */
struct json_literal {
	enum spdk_json_val_type type; /* value type recorded when this literal is parsed */
	uint32_t len;                 /* number of bytes of str that make up the literal */
	uint8_t str[8];               /* literal text, compared byte-for-byte against the input */
};
381 
382 /*
383  * JSON only defines 3 possible literals; they can be uniquely identified by bits
384  *  3 and 4 of the first character:
385  *   'f' = 0b11[00]110
386  *   'n' = 0b11[01]110
387  *   't' = 0b11[10]100
388  * These two bits can be used as an index into the g_json_literals array.
389  */
390 static const struct json_literal g_json_literals[] = {
391 	{SPDK_JSON_VAL_FALSE, 5, "false"},
392 	{SPDK_JSON_VAL_NULL,  4, "null"},
393 	{SPDK_JSON_VAL_TRUE,  4, "true"},
394 	{}
395 };
396 
397 static int
398 match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len)
399 {
400 	assert(end >= start);
401 	if ((size_t)(end - start) < len) {
402 		return SPDK_JSON_PARSE_INCOMPLETE;
403 	}
404 
405 	if (memcmp(start, literal, len) != 0) {
406 		return SPDK_JSON_PARSE_INVALID;
407 	}
408 
409 	return len;
410 }
411 
412 ssize_t
413 spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values,
414 		void **end, uint32_t flags)
415 {
416 	uint8_t *json_end = json + size;
417 	enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH];
418 	size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH];
419 	enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID;
420 	bool trailing_comma = false;
421 	size_t depth = 0; /* index into containers */
422 	size_t cur_value = 0; /* index into values */
423 	size_t con_start_value;
424 	uint8_t *data = json;
425 	uint8_t *new_data;
426 	int rc;
427 	const struct json_literal *lit;
428 	enum {
429 		STATE_VALUE, /* initial state */
430 		STATE_VALUE_SEPARATOR, /* value separator (comma) */
431 		STATE_NAME, /* "name": value */
432 		STATE_NAME_SEPARATOR, /* colon */
433 		STATE_END, /* parsed the complete value, so only whitespace is valid */
434 	} state = STATE_VALUE;
435 
436 #define ADD_VALUE(t, val_start_ptr, val_end_ptr) \
437 	if (values && cur_value < num_values) { \
438 		values[cur_value].type = t; \
439 		values[cur_value].start = val_start_ptr; \
440 		values[cur_value].len = val_end_ptr - val_start_ptr; \
441 	} \
442 	cur_value++
443 
444 	while (data < json_end) {
445 		uint8_t c = *data;
446 
447 		switch (c) {
448 		case ' ':
449 		case '\t':
450 		case '\r':
451 		case '\n':
452 			/* Whitespace is allowed between any tokens. */
453 			data++;
454 			break;
455 
456 		case 't':
457 		case 'f':
458 		case 'n':
459 			/* true, false, or null */
460 			if (state != STATE_VALUE) return SPDK_JSON_PARSE_INVALID;
461 			lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */
462 			assert(lit->str[0] == c);
463 			rc = match_literal(data, json_end, lit->str, lit->len);
464 			if (rc < 0) return rc;
465 			ADD_VALUE(lit->type, data, data + rc);
466 			data += rc;
467 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
468 			trailing_comma = false;
469 			break;
470 
471 		case '"':
472 			if (state != STATE_VALUE && state != STATE_NAME) return SPDK_JSON_PARSE_INVALID;
473 			rc = json_decode_string(data, json_end, &new_data, flags);
474 			if (rc < 0) return rc;
475 			/*
476 			 * Start is data + 1 to skip initial quote.
477 			 * Length is data + rc - 1 to skip both quotes.
478 			 */
479 			ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME,
480 				  data + 1, data + rc - 1);
481 			data = new_data;
482 			if (state == STATE_NAME) {
483 				state = STATE_NAME_SEPARATOR;
484 			} else {
485 				state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
486 			}
487 			trailing_comma = false;
488 			break;
489 
490 		case '-':
491 		case '0':
492 		case '1':
493 		case '2':
494 		case '3':
495 		case '4':
496 		case '5':
497 		case '6':
498 		case '7':
499 		case '8':
500 		case '9':
501 			if (state != STATE_VALUE) return SPDK_JSON_PARSE_INVALID;
502 			rc = json_valid_number(data, json_end);
503 			if (rc < 0) return rc;
504 			ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc);
505 			data += rc;
506 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
507 			trailing_comma = false;
508 			break;
509 
510 		case '{':
511 		case '[':
512 			if (state != STATE_VALUE) return SPDK_JSON_PARSE_INVALID;
513 			if (depth == SPDK_JSON_MAX_NESTING_DEPTH) {
514 				return SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED;
515 			}
516 			if (c == '{') {
517 				con_type = SPDK_JSON_VAL_OBJECT_BEGIN;
518 				state = STATE_NAME;
519 			} else {
520 				con_type = SPDK_JSON_VAL_ARRAY_BEGIN;
521 				state = STATE_VALUE;
522 			}
523 			con_value[depth] = cur_value;
524 			containers[depth++] = con_type;
525 			ADD_VALUE(con_type, data, data + 1);
526 			data++;
527 			trailing_comma = false;
528 			break;
529 
530 		case '}':
531 		case ']':
532 			if (trailing_comma) return SPDK_JSON_PARSE_INVALID;
533 			if (depth == 0) return SPDK_JSON_PARSE_INVALID;
534 			con_type = containers[--depth];
535 			con_start_value = con_value[depth];
536 			if (values && con_start_value < num_values) {
537 				values[con_start_value].len = cur_value - con_start_value - 1;
538 			}
539 			if (c == '}') {
540 				if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) {
541 					return SPDK_JSON_PARSE_INVALID;
542 				}
543 				if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) {
544 					return SPDK_JSON_PARSE_INVALID;
545 				}
546 				ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1);
547 			} else {
548 				if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) {
549 					return SPDK_JSON_PARSE_INVALID;
550 				}
551 				if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) {
552 					return SPDK_JSON_PARSE_INVALID;
553 				}
554 				ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1);
555 			}
556 			con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1];
557 			data++;
558 			state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
559 			trailing_comma = false;
560 			break;
561 
562 		case ',':
563 			if (state != STATE_VALUE_SEPARATOR) return SPDK_JSON_PARSE_INVALID;
564 			data++;
565 			assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN ||
566 			       con_type == SPDK_JSON_VAL_OBJECT_BEGIN);
567 			state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME;
568 			trailing_comma = true;
569 			break;
570 
571 		case ':':
572 			if (state != STATE_NAME_SEPARATOR) return SPDK_JSON_PARSE_INVALID;
573 			data++;
574 			state = STATE_VALUE;
575 			break;
576 
577 		default:
578 			return SPDK_JSON_PARSE_INVALID;
579 		}
580 
581 		if (state == STATE_END) {
582 			break;
583 		}
584 	}
585 
586 	if (state == STATE_END) {
587 		/* Skip trailing whitespace */
588 		while (data < json_end) {
589 			uint8_t c = *data;
590 
591 			if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
592 				data++;
593 			} else {
594 				break;
595 			}
596 		}
597 
598 		/*
599 		 * These asserts are just for sanity checking - they are guaranteed by the allowed
600 		 *  state transitions.
601 		 */
602 		assert(depth == 0);
603 		assert(trailing_comma == false);
604 		assert(data <= json_end);
605 		if (end) {
606 			*end = data;
607 		}
608 		return cur_value;
609 	}
610 
611 	/* Invalid end state - ran out of data */
612 	if (end) {
613 		*end = data;
614 	}
615 	return SPDK_JSON_PARSE_INCOMPLETE;
616 }
617