/* xref: /openbsd-src/lib/libcbor/src/cbor/internal/unicode.c (revision 4dcc46c4d04180142eda526ce521dfb137776d05) */
/*
 * Copyright (c) 2014-2020 Pavel Kalvoda <me@pavelkalvoda.com>
 *
 * libcbor is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */

#include "unicode.h"
#include <stdint.h>

/*
 * DFA states that callers test for. Decoding starts in UTF8_ACCEPT and is
 * back in UTF8_ACCEPT whenever a whole codepoint has been consumed;
 * UTF8_REJECT marks input that is not valid UTF-8.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table driving _cbor_unicode_decode().
 *
 *  - Entries 0..255 map an input byte to its character class ("type").
 *  - Entries from 256 on are the transition rows: the next state is found at
 *    256 + state * 16 + type, i.e. one 16-entry row per state.
 */
static const uint8_t utf8d[] = {
    /* Byte -> character class. */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 00..0f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 10..1f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 20..2f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 30..3f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 40..4f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 50..5f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 60..6f */
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0, /* 70..7f */
    1,   1,   1,   1,   1,   1,   1,   1,
    1,   1,   1,   1,   1,   1,   1,   1, /* 80..8f */
    9,   9,   9,   9,   9,   9,   9,   9,
    9,   9,   9,   9,   9,   9,   9,   9, /* 90..9f */
    7,   7,   7,   7,   7,   7,   7,   7,
    7,   7,   7,   7,   7,   7,   7,   7, /* a0..af */
    7,   7,   7,   7,   7,   7,   7,   7,
    7,   7,   7,   7,   7,   7,   7,   7, /* b0..bf */
    8,   8,   2,   2,   2,   2,   2,   2,
    2,   2,   2,   2,   2,   2,   2,   2, /* c0..cf */
    2,   2,   2,   2,   2,   2,   2,   2,
    2,   2,   2,   2,   2,   2,   2,   2, /* d0..df */
    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3,
    0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, /* e0..ef */
    0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8,
    0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, /* f0..ff */

    /* (state, class) -> next state; one 16-entry row per state. */
    0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1,
    0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, /* s0 */
    1,   1,   1,   1,   1,   1,   1,   1,
    1,   1,   1,   1,   1,   1,   1,   1, /* s1 */
    1,   0,   1,   1,   1,   1,   1,   0,
    1,   0,   1,   1,   1,   1,   1,   1, /* s2 */
    1,   2,   1,   1,   1,   1,   1,   2,
    1,   2,   1,   1,   1,   1,   1,   1, /* s3 */
    1,   1,   1,   1,   1,   1,   1,   2,
    1,   1,   1,   1,   1,   1,   1,   1, /* s4 */
    1,   2,   1,   1,   1,   1,   1,   1,
    1,   2,   1,   1,   1,   1,   1,   1, /* s5 */
    1,   1,   1,   1,   1,   1,   1,   3,
    1,   3,   1,   1,   1,   1,   1,   1, /* s6 */
    1,   3,   1,   1,   1,   1,   1,   3,
    1,   3,   1,   1,   1,   1,   1,   1, /* s7 */
    1,   3,   1,   1,   1,   1,   1,   1,
    1,   1,   1,   1,   1,   1,   1,   1, /* s8 */
};

569e5c2ddcSdjm /* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann
579e5c2ddcSdjm  * <bjoern@hoehrmann.de> */
58da0d961cSdjm /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
_cbor_unicode_decode(uint32_t * state,uint32_t * codep,uint32_t byte)59da0d961cSdjm uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
60da0d961cSdjm   uint32_t type = utf8d[byte];
61da0d961cSdjm 
629e5c2ddcSdjm   *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
639e5c2ddcSdjm                                    : (0xff >> type) & (byte);
64da0d961cSdjm 
65da0d961cSdjm   *state = utf8d[256 + *state * 16 + type];
66da0d961cSdjm   return *state;
67da0d961cSdjm }
68da0d961cSdjm 
/*
 * Validate `source` as UTF-8 and count the codepoints it contains.
 *
 * source:        bytes to inspect.
 * source_length: number of bytes readable at `source`.
 * status:        out parameter, always written. On success it holds
 *                {.location = 0, .status = _CBOR_UNICODE_OK}. On failure it
 *                holds _CBOR_UNICODE_BADCP with .location set to the index of
 *                the byte at which the decoder rejected the input, or to
 *                source_length when the input ends in the middle of a
 *                multibyte sequence.
 *
 * Returns the number of codepoints, or 0 on failure. An empty input also
 * yields 0, so callers must consult `status` to tell the two apart.
 */
uint64_t _cbor_unicode_codepoint_count(cbor_data source, uint64_t source_length,
                                       struct _cbor_unicode_status* status) {
  *status =
      (struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK};
  /* codepoint is only scratch space for the decoder; its value is unused. */
  uint32_t codepoint = 0, state = UTF8_ACCEPT, res;
  uint64_t pos = 0, count = 0;

  for (; pos < source_length; pos++) {
    res = _cbor_unicode_decode(&state, &codepoint, source[pos]);

    if (res == UTF8_ACCEPT) {
      count++;
    } else if (res == UTF8_REJECT) {
      goto error;
    }
  }

  /* Unfinished multibyte codepoint */
  if (state != UTF8_ACCEPT) {
    goto error;
  }

  return count;

error:
  *status = (struct _cbor_unicode_status){.location = pos,
                                          .status = _CBOR_UNICODE_BADCP};
  return 0;
}