/*
 * Copyright (c) 2014-2020 Pavel Kalvoda <me@pavelkalvoda.com>
 *
 * libcbor is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */
7da0d961cSdjm
8da0d961cSdjm #include "unicode.h"
9*4dcc46c4Sdjm #include <stdint.h>
10da0d961cSdjm
/*
 * Distinguished states of the UTF-8 decoding automaton.
 * UTF8_ACCEPT is both the start state and the state reached whenever a
 * complete codepoint has been consumed; UTF8_REJECT is entered when the
 * input cannot be valid UTF-8.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
13da0d961cSdjm
/*
 * Lookup table driving the UTF-8 decoding automaton.
 *
 * Layout:
 *   - entries [0, 256):  character class of each possible input byte;
 *   - entries [256, 400): transition table, 16 slots per state, indexed as
 *     256 + state * 16 + class. The stored value is the next state
 *     (0 = accept, 1 = reject, 2..8 = in the middle of a sequence).
 *
 * Each row below holds exactly 16 entries.
 */
static const uint8_t utf8d[] = {
    /* Byte -> character class */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00..0f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10..1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20..2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30..3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40..4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50..5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60..6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70..7f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80..8f */
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, /* 90..9f */
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* a0..af */
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* b0..bf */
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0..cf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0..df */
    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3,
    0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, /* e0..ef */
    0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8,
    0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, /* f0..ff */

    /* (state, class) -> next state */
    0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1,
    0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1,         /* s0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* s1 */
    1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, /* s2 */
    1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, /* s3 */
    1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, /* s4 */
    1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, /* s5 */
    1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, /* s6 */
    1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, /* s7 */
    1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* s8 */
};
55da0d961cSdjm
569e5c2ddcSdjm /* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann
579e5c2ddcSdjm * <bjoern@hoehrmann.de> */
58da0d961cSdjm /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
_cbor_unicode_decode(uint32_t * state,uint32_t * codep,uint32_t byte)59da0d961cSdjm uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
60da0d961cSdjm uint32_t type = utf8d[byte];
61da0d961cSdjm
629e5c2ddcSdjm *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
639e5c2ddcSdjm : (0xff >> type) & (byte);
64da0d961cSdjm
65da0d961cSdjm *state = utf8d[256 + *state * 16 + type];
66da0d961cSdjm return *state;
67da0d961cSdjm }
68da0d961cSdjm
/*
 * Validates a buffer as UTF-8 and counts the codepoints it contains.
 *
 * source:        the bytes to examine.
 * source_length: number of bytes readable from source.
 * status:        output; always written. Set to {location = 0,
 *                _CBOR_UNICODE_OK} on success. On failure set to
 *                _CBOR_UNICODE_BADCP with location equal to the index of the
 *                byte that was rejected, or to source_length when the buffer
 *                ends in the middle of a multibyte sequence.
 *
 * Returns the number of codepoints on success and 0 on failure. An empty
 * buffer also yields 0, so callers must consult status to tell the two apart.
 */
uint64_t _cbor_unicode_codepoint_count(cbor_data source, uint64_t source_length,
                                       struct _cbor_unicode_status* status) {
  *status =
      (struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK};
  /* codepoint is only scratch space for the decoder; its value is unused. */
  uint32_t codepoint = 0, state = UTF8_ACCEPT;
  uint64_t pos = 0, count = 0;

  for (; pos < source_length; pos++) {
    uint32_t res = _cbor_unicode_decode(&state, &codepoint, source[pos]);

    if (res == UTF8_ACCEPT) {
      count++;
    } else if (res == UTF8_REJECT) {
      goto error;
    }
  }

  /* Unfinished multibyte codepoint */
  if (state != UTF8_ACCEPT) {
    goto error;
  }

  return count;

error:
  *status = (struct _cbor_unicode_status){.location = pos,
                                          .status = _CBOR_UNICODE_BADCP};
  return 0;
}
96