1 /*
2 * Copyright (c) 2014-2020 Pavel Kalvoda <me@pavelkalvoda.com>
3 *
4 * libcbor is free software; you can redistribute it and/or modify
5 * it under the terms of the MIT license. See LICENSE for details.
6 */
7
8 #include "unicode.h"
9 #include <stdint.h>
10
11 #define UTF8_ACCEPT 0
12 #define UTF8_REJECT 1
13
/*
 * Lookup table driving the DFA-based UTF-8 decoder in this file.
 *
 * Layout (400 entries in total):
 *   - entries [0, 256):   map every possible byte value to a character
 *                         class in the range 0..11;
 *   - entries [256, 400): nine rows of sixteen entries each; the next state
 *                         for a given (state, class) pair is stored at index
 *                         256 + state * 16 + class.
 *
 * State 0 corresponds to UTF8_ACCEPT and state 1 to UTF8_REJECT.
 */
static const uint8_t utf8d[] = {
    /* ---- byte value -> character class ---- */
    /* 00..0f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 10..1f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 20..2f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 30..3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 40..4f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 50..5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 60..6f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 70..7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 80..8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 90..9f */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* a0..af */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* b0..bf */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* c0..cf */ 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* d0..df */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* e0..ef */ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* f0..ff */ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

    /* ---- (state, class) -> next state, one row per state ---- */
    /* s0 */ 0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1,
    /* s1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* s2 */ 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
    /* s3 */ 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
    /* s4 */ 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
    /* s5 */ 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
    /* s6 */ 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
    /* s7 */ 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
    /* s8 */ 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
55
56 /* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann
57 * <bjoern@hoehrmann.de> */
58 /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
_cbor_unicode_decode(uint32_t * state,uint32_t * codep,uint32_t byte)59 uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
60 uint32_t type = utf8d[byte];
61
62 *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
63 : (0xff >> type) & (byte);
64
65 *state = utf8d[256 + *state * 16 + type];
66 return *state;
67 }
68
/*
 * Validates a byte sequence as UTF-8 and counts the codepoints it contains.
 *
 * source:        bytes to inspect.
 * source_length: number of bytes readable from source.
 * status:        out parameter. Set to {location 0, _CBOR_UNICODE_OK} on
 *                success. On failure it is set to _CBOR_UNICODE_BADCP with
 *                location holding the index of the byte that was rejected,
 *                or source_length when the input ends in the middle of a
 *                multibyte sequence.
 *
 * Returns the number of codepoints on success and 0 on failure; an empty
 * input also yields 0, so callers need status to tell the two apart.
 */
uint64_t _cbor_unicode_codepoint_count(cbor_data source, uint64_t source_length,
                                       struct _cbor_unicode_status* status) {
  uint32_t dfa_state = UTF8_ACCEPT;
  /* Written by the decoder; the decoded value itself is not used here. */
  uint32_t scratch_codepoint = 0;
  uint64_t offset = 0;
  uint64_t codepoints = 0;

  *status =
      (struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK};

  while (offset < source_length) {
    const uint32_t outcome =
        _cbor_unicode_decode(&dfa_state, &scratch_codepoint, source[offset]);

    if (outcome == UTF8_REJECT) {
      /* Leave offset pointing at the offending byte. */
      break;
    }
    if (outcome == UTF8_ACCEPT) {
      codepoints++;
    }
    offset++;
  }

  /* Clean finish: every started sequence was completed. */
  if (dfa_state == UTF8_ACCEPT) {
    return codepoints;
  }

  /* Either a byte was rejected or the input stopped mid-sequence. */
  *status = (struct _cbor_unicode_status){.location = offset,
                                          .status = _CBOR_UNICODE_BADCP};
  return 0;
}
96